1 /*
2  * charset.c -
3  *
4  * Written By:  MURAOKA Taro <koron@tka.att.ne.jp>
5  * Last Change: 20-Sep-2009.
6  */
7 module migemo_d.charset;
8 
9 
10 private static import core.stdc.stdio;
11 
/* Size in bytes of the buffer charset_detect_file fills from the file for detection. */
enum BUFLEN_DETECT = 4096;
13 
/*
 * Character set identifiers.
 *
 * Returned by charset_detect_buf() / charset_detect_file() and accepted by
 * charset_getproc().
 */
public enum
{
	// No character set (detection found no winner; getproc yields null procs).
	CHARSET_NONE = 0,
	// CP932 (Shift_JIS).
	CHARSET_CP932 = 1,
	// EUC-JP.
	CHARSET_EUCJP = 2,
	// UTF-8.
	CHARSET_UTF8 = 3,
}
21 	
/* Decoder callback type: reads a character from a byte string, stores its code through the uint pointer, returns an int (the decoders in this module return the number of bytes consumed). */
public alias charset_proc_char2int = extern (C) nothrow @nogc int function(const (char)*, uint*);
/* Encoder callback type: writes the bytes for a character code into the char buffer, returns an int (the encoders in this module return the number of bytes produced, 0 if not converted). */
public alias charset_proc_int2char = extern (C) nothrow @nogc int function(uint, char*);
/* Upper-case spellings of the two callback types above. */
public alias CHARSET_PROC_CHAR2INT = .charset_proc_char2int;
public alias CHARSET_PROC_INT2CHAR = .charset_proc_int2char;
26 
/**
 * Decodes one CP932 character from the start of a byte string.
 *
 * A two-byte character is recognised when the first byte is in
 * 0x81-0x9F or 0xE0-0xF0 and the second byte is in 0x40-0x7E or
 * 0x80-0xFC. The second byte is only read when the first byte is
 * in the lead-byte range.
 *
 * Params:
 *      in_ = input bytes; must not be null
 *      out_ = receives the character code (lead << 8 | trail for two-byte
 *             characters, otherwise the first byte); may be null
 *
 * Returns: number of bytes consumed (2 or 1)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int cp932_char2int(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		immutable char lead = in_[0];
		immutable bool isLead = ((0x81 <= lead) && (lead <= 0x9F)) || ((0xE0 <= lead) && (lead <= 0xF0));

		if (isLead) {
			immutable char trail = in_[1];
			immutable bool isTrail = ((0x40 <= trail) && (trail <= 0x7E)) || ((0x80 <= trail) && (trail <= 0xFC));

			if (isTrail) {
				if (out_ != null) {
					*out_ = (cast(uint)(lead) << 8) | cast(uint)(trail);
				}

				return 2;
			}
		}

		// Anything else is passed through as a single byte.
		if (out_ != null) {
			*out_ = lead;
		}

		return 1;
	}
52 
/**
 * Encodes a two-byte CP932 character code into bytes.
 *
 * Params:
 *      in_ = character code
 *      out_ = receives two bytes (high byte first); may be null, in which
 *             case only the length is reported
 *
 * Returns: 2 when in_ is 0x0100 or greater, otherwise 0 (nothing written)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int cp932_int2char(uint in_, char* out_)

	do
	{
		// Codes below 0x0100 are not converted by this function.
		if (in_ < 0x0100) {
			return 0;
		}

		if (out_ != null) {
			immutable uint high = (in_ >> 8) & 0xFF;
			immutable uint low = in_ & 0xFF;
			out_[0] = cast(char)(high);
			out_[1] = cast(char)(low);
		}

		return 2;
	}
70 
/**
 * Tests whether a byte lies in the range 0xA1-0xFE (inclusive).
 *
 * Params:
 *      c = byte to test
 *
 * Returns: true when 0xA1 <= c <= 0xFE
 */
pragma(inline, true)
pure nothrow @safe @nogc @live
bool IS_EUC_RANGE(char c)

	do
	{
		return !((c < 0xA1) || (c > 0xFE));
	}
79 
/**
 * Decodes one EUC-JP character from the start of a byte string.
 *
 * Two bytes are consumed when either
 *   - the first byte is 0x8E and the second is in 0xA0-0xDF, or
 *   - both bytes satisfy IS_EUC_RANGE (0xA1-0xFE).
 * The second byte is only read when the first byte is 0x8E or in the
 * 0xA1-0xFE range.
 *
 * Params:
 *      in_ = input bytes; must not be null
 *      out_ = receives the character code (first << 8 | second for two-byte
 *             characters, otherwise the first byte); may be null
 *
 * Returns: number of bytes consumed (2 or 1)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int eucjp_char2int(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		immutable char first = in_[0];
		bool isDouble = false;

		if (first == 0x8E) {
			// 0x8E is outside 0xA1-0xFE, so only this rule can apply.
			isDouble = (0xA0 <= in_[1]) && (in_[1] <= 0xDF);
		} else if (.IS_EUC_RANGE(first)) {
			isDouble = .IS_EUC_RANGE(in_[1]);
		}

		if (!isDouble) {
			if (out_ != null) {
				*out_ = first;
			}

			return 1;
		}

		if (out_ != null) {
			*out_ = (cast(uint)(first) << 8) | cast(uint)(in_[1]);
		}

		return 2;
	}
105 
/**
 * Encodes a two-byte EUC-JP character code into bytes.
 *
 * Params:
 *      in_ = character code
 *      out_ = receives two bytes (high byte first); may be null, in which
 *             case only the length is reported
 *
 * Returns: 2 when in_ is 0x0100 or greater, otherwise 0 (nothing written)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int eucjp_int2char(uint in_, char* out_)

	do
	{
		/* Same logic as the CP932 encoder, but kept separate so that JIS X 0213 support can be added in the future. */
		if (in_ < 0x0100) {
			return 0;
		}

		if (out_ != null) {
			immutable uint high = (in_ >> 8) & 0xFF;
			immutable uint low = in_ & 0xFF;
			out_[0] = cast(char)(high);
			out_[1] = cast(char)(low);
		}

		return 2;
	}
124 
/**
 * Decodes one multi-byte UTF-8 sequence (2 to 6 bytes) from the start of
 * a byte string.
 *
 * The sequence length is the number of leading 1 bits in the first byte.
 * Every following byte must have the form 10xxxxxx. Continuation bytes are
 * read one at a time and decoding stops at the first one that does not
 * match, so a NUL terminator ends the scan.
 *
 * Params:
 *      in_ = input bytes; must not be null
 *      out_ = receives the decoded code point on success; may be null.
 *             Left untouched when 0 is returned.
 *
 * Returns: number of bytes consumed (2-6), or 0 when the input does not
 *          start with a well-formed multi-byte sequence: an ASCII byte, a
 *          stray continuation byte, a bad continuation byte, or a lead byte
 *          of 0xFE/0xFF.
 */
pure nothrow @trusted @nogc @live
package int utf8_char2int_noascii(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		int len = 0;
		uint ch;

		// Count the leading 1 bits of the first byte by shifting them past bit 7.
		for (ch = in_[0]; ch & 0x80; ch <<= 1) {
			++len;
		}

		/*core.stdc.stdio.printf("len=%d in_=%s\n", len, in_);*/
		// len 0: ASCII byte. len 1: stray continuation byte.
		// len 7/8: lead bytes 0xFE/0xFF, which are not valid lead bytes and
		// would need more payload bits than the 32-bit accumulator holds.
		if ((len < 2) || (len > 6)) {
			return 0;
		}

		// Recover the payload bits of the lead byte (undo the shifts above).
		ch = (ch & 0xFF) >> len;

		for (int i = 1; i < len; ++i) {
			if ((in_[i] & 0xC0) != 0x80) {
				return 0;
			}

			ch <<= 6;
			ch += in_[i] & 0x3F;
		}

		/*core.stdc.stdio.printf("len=%d in_=%s ch=%08x\n", len, in_, ch);*/
		if (out_ != null) {
			*out_ = ch;
		}

		return len;
	}
165 
/**
 * Decodes one UTF-8 character from the start of a byte string.
 *
 * Multi-byte decoding is delegated to utf8_char2int_noascii(). When that
 * helper returns 0, the first byte is passed through as the character code.
 *
 * Params:
 *      in_ = input bytes; must not be null
 *      out_ = receives the character code; may be null
 *
 * Returns: number of bytes consumed (always at least 1)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int utf8_char2int(const (char)* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		immutable int consumed = .utf8_char2int_noascii(in_, out_);

		if (consumed != 0) {
			return consumed;
		}

		// Not a multi-byte sequence: treat the first byte as the character.
		if (out_ != null) {
			*out_ = in_[0];
		}

		return 1;
	}
189 
/**
 * Encodes a code point as a multi-byte UTF-8 sequence (2 to 6 bytes, using
 * the original extended UTF-8 layout with 5- and 6-byte forms).
 *
 * Lead-byte prefixes by length: 2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0,
 * 5 -> 0xF8, 6 -> 0xFC. Every following byte is 0x80 plus 6 payload bits.
 *
 * Params:
 *      in_ = code point to encode
 *      out_ = receives the encoded bytes (up to 6); may be null, in which
 *             case only the length is reported
 *
 * Returns: number of bytes in the encoding (2-6), or 0 when in_ is below
 *          0x80 (ASCII is not converted by this function)
 *
 * Note: the 6-byte form carries 31 payload bits. For in_ >= 0x80000000 the
 * extra bit spills into the lead byte, producing 0xFE/0xFF.
 */
extern (C)
pure nothrow @trusted @nogc @live
public int utf8_int2char(uint in_, char* out_)

	do
	{
		if (in_ < 0x80) {
			return 0;
		}

		if (in_ < 0x0800) {
			if (out_ != null) {
				out_[0] = cast(char)(0xC0 + (in_ >> 6));
				out_[1] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 2;
		}

		if (in_ < 0x010000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xE0 + (in_ >> 12));
				out_[1] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 3;
		}

		if (in_ < 0x200000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xF0 + (in_ >> 18));
				out_[1] = 0x80 + ((in_ >> 12) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[3] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 4;
		}

		if (in_ < 0x04000000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xF8 + (in_ >> 24));
				out_[1] = 0x80 + ((in_ >> 18) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 12) & 0x3F);
				out_[3] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[4] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 5;
		}

		if (out_ != null) {
			// 6-byte lead prefix is 1111110x (0xFC). 0xF8 is the 5-byte
			// prefix and would make the output decode as a 5-byte sequence.
			out_[0] = cast(char)(0xFC + (in_ >> 30));
			out_[1] = 0x80 + ((in_ >> 24) & 0x3F);
			out_[2] = 0x80 + ((in_ >> 18) & 0x3F);
			out_[3] = 0x80 + ((in_ >> 12) & 0x3F);
			out_[4] = 0x80 + ((in_ >> 6) & 0x3F);
			out_[5] = 0x80 + ((in_ >> 0) & 0x3F);
		}

		return 6;
	}
253 
/**
 * Guesses the character set of a byte buffer.
 *
 * Each byte is fed to three independent scorers (CP932, EUC-JP, UTF-8).
 * A scorer gains a point whenever it sees a byte that completes or
 * continues a multi-byte character in its encoding. The UTF-8 scorer
 * stops permanently at the first malformed sequence.
 *
 * Params:
 *      buf = bytes to examine; must not be null
 *      len = number of bytes in buf (no bytes are examined when len <= 0)
 *
 * Returns: CHARSET_EUCJP, CHARSET_UTF8 or CHARSET_CP932 for the scorer with
 *          the strictly highest score (UTF-8 only if it never failed),
 *          otherwise CHARSET_NONE
 */
extern (C)
pure nothrow @trusted @nogc @live
public int charset_detect_buf(const char* buf, int len)

	in
	{
		assert(buf != null);
	}

	do
	{
		int cp932Score = 0;
		int eucScore = 0;
		int utf8Score = 0;

		// Number of UTF-8 continuation bytes still expected.
		int utf8Pending = 0;

		// True while the previous byte was a lead byte awaiting its trail byte.
		bool cp932WantTrail = false;
		bool eucWantTrail = false;

		// Set once the input is known not to be well-formed UTF-8.
		bool utf8Broken = false;

		foreach (idx; 0 .. len) {
			immutable char c = buf[idx];

			// CP932 (Shift_JIS) check
			if (cp932WantTrail) {
				immutable bool isTrail = ((0x40 <= c) && (c <= 0x7E)) || ((0x80 <= c) && (c <= 0xFC));

				if (isTrail) {
					++cp932Score;
				}

				cp932WantTrail = false;
			} else {
				cp932WantTrail = ((0x81 <= c) && (c <= 0x9F)) || ((0xE0 <= c) && (c <= 0xF0));
			}

			// EUC check
			immutable bool inEucRange = (0xA1 <= c) && (c <= 0xFE);

			if (eucWantTrail) {
				if (inEucRange) {
					++eucScore;
				}

				eucWantTrail = false;
			} else {
				eucWantTrail = inEucRange;
			}

			// UTF-8 check (skipped for the rest of the buffer after a failure)
			if (utf8Broken) {
				continue;
			}

			if (utf8Pending >= 1) {
				if ((c & 0xC0) == 0x80) {
					++utf8Score;
					--utf8Pending;
				} else {
					--utf8Score;
					utf8Pending = 0;
					utf8Broken = true;
				}
			} else if ((c & 0x80) != 0) {
				// Lead byte: the run of high 1 bits gives the sequence length.
				if ((c & 0xE0) == 0xC0) {
					utf8Pending = 1;
				} else if ((c & 0xF0) == 0xE0) {
					utf8Pending = 2;
				} else if ((c & 0xF8) == 0xF0) {
					utf8Pending = 3;
				} else if ((c & 0xFC) == 0xF8) {
					utf8Pending = 4;
				} else if ((c & 0xFE) == 0xFC) {
					utf8Pending = 5;
				} else {
					utf8Broken = true;
					--utf8Score;
				}
			}

			if (utf8Score < 0) {
				utf8Score = 0;
			}
		}

		// Finally, return the encoding with the highest score.
		if ((eucScore > cp932Score) && (eucScore > utf8Score)) {
			return .CHARSET_EUCJP;
		}

		if ((!utf8Broken) && (utf8Score > eucScore) && (utf8Score > cp932Score)) {
			return .CHARSET_UTF8;
		}

		if ((cp932Score > eucScore) && (cp932Score > utf8Score)) {
			return .CHARSET_CP932;
		}

		return .CHARSET_NONE;
	}
347 
/**
 * Looks up the decoder/encoder function pair for a character set.
 *
 * Params:
 *      charset = one of the CHARSET_* identifiers
 *      char2int = receives the decoder (bytes -> code); may be null
 *      int2char = receives the encoder (code -> bytes); may be null
 *
 * For any charset other than CHARSET_CP932, CHARSET_EUCJP and CHARSET_UTF8,
 * both outputs are set to null.
 */
extern (C)
nothrow @nogc
public void charset_getproc(int charset, .CHARSET_PROC_CHAR2INT* char2int, .CHARSET_PROC_INT2CHAR* int2char)

	do
	{
		.CHARSET_PROC_CHAR2INT decoder = null;
		.CHARSET_PROC_INT2CHAR encoder = null;

		if (charset == .CHARSET_CP932) {
			decoder = &.cp932_char2int;
			encoder = &.cp932_int2char;
		} else if (charset == .CHARSET_EUCJP) {
			decoder = &.eucjp_char2int;
			encoder = &.eucjp_int2char;
		} else if (charset == .CHARSET_UTF8) {
			decoder = &.utf8_char2int;
			encoder = &.utf8_int2char;
		}

		// Each output is optional; write only to the ones supplied.
		if (char2int != null) {
			*char2int = decoder;
		}

		if (int2char != null) {
			*int2char = encoder;
		}
	}
388 
/**
 * Guesses the character set of a file from its first BUFLEN_DETECT bytes.
 *
 * Params:
 *      path = file path passed to fopen(); must not be null
 *
 * Returns: the result of charset_detect_buf() on the bytes read, or
 *          CHARSET_NONE when the file cannot be opened or nothing was read
 */
extern (C)
nothrow @nogc
public int charset_detect_file(const (char)* path)

	in
	{
		assert(path != null);
	}

	do
	{
		// NOTE(review): the "t" mode flag is not part of standard C fopen();
		// confirm it behaves as intended on every target platform.
		core.stdc.stdio.FILE* fp = core.stdc.stdio.fopen(path, "rt");

		if (fp == null) {
			return .CHARSET_NONE;
		}

		// The file is closed on every exit path below.
		scope (exit) {
			core.stdc.stdio.fclose(fp);
		}

		char[.BUFLEN_DETECT] buf;
		immutable size_t nread = core.stdc.stdio.fread(&(buf[0]), buf[0].sizeof, buf.length, fp);

		// Nothing read, or too many bytes to express as the int length parameter.
		if ((nread == 0) || (nread > int.max)) {
			return .CHARSET_NONE;
		}

		return .charset_detect_buf(&(buf[0]), cast(int)(nread));
	}