/*
 * charset.c -
 *
 * Written By:  MURAOKA Taro <koron@tka.att.ne.jp>
 * Last Change: 20-Sep-2009.
 */
module migemo_d.charset;


private static import core.stdc.stdio;

/// Size in bytes of the buffer that charset_detect_file reads and inspects.
enum BUFLEN_DETECT = 4096;

/// Charset identifiers returned by the detectors and accepted by charset_getproc.
public enum
{
	CHARSET_NONE = 0,
	CHARSET_CP932 = 1,
	CHARSET_EUCJP = 2,
	CHARSET_UTF8 = 3,
}

/// Decoder signature: reads one character from a byte string, stores its code
/// in the second argument (when non-null) and returns the bytes consumed.
public alias charset_proc_char2int = extern (C) nothrow @nogc int function(const (char)*, uint*);

/// Encoder signature: writes the byte sequence for a character code into the
/// second argument (when non-null) and returns the number of bytes.
public alias charset_proc_int2char = extern (C) nothrow @nogc int function(uint, char*);

/// Upper-case aliases of the two function pointer types above.
public alias CHARSET_PROC_CHAR2INT = .charset_proc_char2int;

/// ditto
public alias CHARSET_PROC_INT2CHAR = .charset_proc_int2char;

/**
 * Reads one CP932 character from in_.
 *
 * A lead byte in 0x81-0x9F or 0xE0-0xF0 followed by a trail byte in
 * 0x40-0x7E or 0x80-0xFC is treated as a double-byte character whose code is
 * (lead << 8) | trail. Anything else is treated as a single byte.
 *
 * Params:
 *      in_ = byte string to read from; must not be null
 *      out_ = receives the character code; may be null
 *
 * Returns: number of bytes consumed (2 or 1)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int cp932_char2int(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		// in_[1] is only inspected once in_[0] has been accepted as a lead byte.
		if ((((0x81 <= in_[0]) && (in_[0] <= 0x9F)) || ((0xE0 <= in_[0]) && (in_[0] <= 0xF0))) && (((0x40 <= in_[1]) && (in_[1] <= 0x7E)) || ((0x80 <= in_[1]) && (in_[1] <= 0xFC)))) {
			if (out_ != null) {
				*out_ = (cast(uint)(in_[0]) << 8) | (cast(uint)(in_[1]));
			}

			return 2;
		} else {
			if (out_ != null) {
				*out_ = in_[0];
			}

			return 1;
		}
	}

/**
 * Writes the two-byte CP932 form of a character code.
 *
 * Params:
 *      in_ = character code
 *      out_ = receives two bytes (high byte first); may be null
 *
 * Returns: 2 when in_ is 0x0100 or greater, otherwise 0 with nothing written
 */
extern (C)
pure nothrow @trusted @nogc @live
public int cp932_int2char(uint in_, char* out_)

	do
	{
		if (in_ >= 0x0100) {
			if (out_ != null) {
				out_[0] = cast(char)((in_ >> 8) & 0xFF);
				out_[1] = cast(char)(in_ & 0xFF);
			}

			return 2;
		} else {
			return 0;
		}
	}

/**
 * Tests whether a byte lies in the range 0xA1-0xFE.
 *
 * Params:
 *      c = byte to test
 *
 * Returns: true when 0xA1 <= c <= 0xFE
 */
pragma(inline, true)
pure nothrow @safe @nogc @live
bool IS_EUC_RANGE(char c)

	do
	{
		return (0xA1 <= c) && (c <= 0xFE);
	}

/**
 * Reads one EUC-JP character from in_.
 *
 * Either 0x8E followed by a byte in 0xA0-0xDF, or two bytes that both satisfy
 * IS_EUC_RANGE, are treated as a double-byte character whose code is
 * (first << 8) | second. Anything else is treated as a single byte.
 *
 * Params:
 *      in_ = byte string to read from; must not be null
 *      out_ = receives the character code; may be null
 *
 * Returns: number of bytes consumed (2 or 1)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int eucjp_char2int(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		// in_[1] is only inspected once in_[0] has matched a first-byte pattern.
		if (((in_[0] == 0x8E) && (0xA0 <= in_[1]) && (in_[1] <= 0xDF)) || (.IS_EUC_RANGE(in_[0]) && (.IS_EUC_RANGE(in_[1])))) {
			if (out_ != null) {
				*out_ = (cast(uint)(in_[0]) << 8) | cast(uint)(in_[1]);
			}

			return 2;
		} else {
			if (out_ != null) {
				*out_ = in_[0];
			}

			return 1;
		}
	}

/**
 * Writes the two-byte EUC-JP form of a character code.
 *
 * Params:
 *      in_ = character code
 *      out_ = receives two bytes (high byte first); may be null
 *
 * Returns: 2 when in_ is 0x0100 or greater, otherwise 0 with nothing written
 */
extern (C)
pure nothrow @trusted @nogc @live
public int eucjp_int2char(uint in_, char* out_)

	do
	{
		/* Same content as CP932, but kept separate so that JIS X 0213 can be supported in the future. */
		if (in_ >= 0x0100) {
			if (out_ != null) {
				out_[0] = cast(char)((in_ >> 8) & 0xFF);
				out_[1] = cast(char)(in_ & 0xFF);
			}

			return 2;
		} else {
			return 0;
		}
	}

/**
 * Reads one multi-byte UTF-8 sequence (2 to 6 bytes) from in_.
 *
 * The sequence length is the number of leading 1 bits in the first byte.
 * Every following byte must have the form 10xxxxxx.
 *
 * Params:
 *      in_ = byte string to read from; must not be null
 *      out_ = receives the decoded code; may be null
 *
 * Returns: number of bytes consumed, or 0 when in_ does not start with a
 *          multi-byte sequence (single byte, stray continuation byte,
 *          lead byte 0xFE/0xFF, or a bad continuation byte)
 */
pure nothrow @trusted @nogc @live
package int utf8_char2int_noascii(const char* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		int len = 0;
		uint ch;

		// Count the leading 1 bits of the first byte.
		for (ch = in_[0]; ch & 0x80; ch <<= 1) {
			++len;
		}

		/*core.stdc.stdio.printf("len=%d in_=%s\n", len, in_);*/
		// len 0/1: single byte or stray continuation byte.
		// len 7/8: lead bytes 0xFE/0xFF, which utf8_int2char never produces
		// and whose payload would not fit in the 32-bit result.
		if ((len < 2) || (len > 6)) {
			return 0;
		}

		// Recover the payload bits of the lead byte (it was shifted left len times).
		ch = (ch & 0xFF) >> len;

		for (int i = 1; i < len; ++i) {
			if ((in_[i] & 0xC0) != 0x80) {
				return 0;
			}

			ch <<= 6;
			ch += in_[i] & 0x3F;
		}

		/*core.stdc.stdio.printf("len=%d in_=%s ch=%08x\n", len, in_, ch);*/
		if (out_ != null) {
			*out_ = ch;
		}

		return len;
	}

/**
 * Reads one UTF-8 character from in_.
 *
 * Falls back to treating the first byte as a single character when
 * utf8_char2int_noascii does not recognise a multi-byte sequence.
 *
 * Params:
 *      in_ = byte string to read from; must not be null
 *      out_ = receives the character code; may be null
 *
 * Returns: number of bytes consumed (1 to 6)
 */
extern (C)
pure nothrow @trusted @nogc @live
public int utf8_char2int(const (char)* in_, uint* out_)

	in
	{
		assert(in_ != null);
	}

	do
	{
		int retval = .utf8_char2int_noascii(in_, out_);

		if (retval) {
			return retval;
		} else {
			if (out_ != null) {
				*out_ = in_[0];
			}

			return 1;
		}
	}

/**
 * Writes the multi-byte UTF-8 form (2 to 6 bytes) of a character code.
 *
 * Params:
 *      in_ = character code
 *      out_ = receives the encoded bytes; may be null, in which case only the
 *             length is computed
 *
 * Returns: number of bytes in the encoding, or 0 when in_ is below 0x80
 */
extern (C)
pure nothrow @trusted @nogc @live
public int utf8_int2char(uint in_, char* out_)

	do
	{
		if (in_ < 0x80) {
			return 0;
		}

		if (in_ < 0x0800) {
			if (out_ != null) {
				out_[0] = cast(char)(0xC0 + (in_ >> 6));
				out_[1] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 2;
		}

		if (in_ < 0x010000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xE0 + (in_ >> 12));
				out_[1] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 3;
		}

		if (in_ < 0x200000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xF0 + (in_ >> 18));
				out_[1] = 0x80 + ((in_ >> 12) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[3] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 4;
		}

		if (in_ < 0x04000000) {
			if (out_ != null) {
				out_[0] = cast(char)(0xF8 + (in_ >> 24));
				out_[1] = 0x80 + ((in_ >> 18) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 12) & 0x3F);
				out_[3] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[4] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 5;
		} else {
			if (out_ != null) {
				// A six-byte sequence starts with 1111110x (0xFC), not the
				// five-byte marker 0xF8; otherwise utf8_char2int_noascii would
				// read the result back as a five-byte sequence. Only bit 30
				// fits in the lead byte, so bit 31 of in_ is not encoded.
				out_[0] = cast(char)(0xFC | ((in_ >> 30) & 0x01));
				out_[1] = 0x80 + ((in_ >> 24) & 0x3F);
				out_[2] = 0x80 + ((in_ >> 18) & 0x3F);
				out_[3] = 0x80 + ((in_ >> 12) & 0x3F);
				out_[4] = 0x80 + ((in_ >> 6) & 0x3F);
				out_[5] = 0x80 + ((in_ >> 0) & 0x3F);
			}

			return 6;
		}
	}

/**
 * Guesses the charset of a byte buffer.
 *
 * Scans the buffer once while keeping one score per candidate encoding
 * (CP932, EUC-JP, UTF-8) and returns the candidate whose score is strictly
 * greater than both others.
 *
 * Params:
 *      buf = bytes to inspect; must not be null
 *      len = number of bytes in buf to inspect
 *
 * Returns: CHARSET_EUCJP, CHARSET_UTF8, CHARSET_CP932, or CHARSET_NONE when
 *          no candidate has a strictly highest score
 */
extern (C)
pure nothrow @trusted @nogc @live
public int charset_detect_buf(const char* buf, int len)

	in
	{
		assert(buf != null);
	}

	do
	{
		int sjis = 0;
		int euc = 0;
		int utf8 = 0;

		// Number of UTF-8 continuation bytes still expected.
		int umode = 0;

		// True when the previous byte was a SJIS / EUC first byte.
		bool smode = false;
		bool emode = false;

		// Set once a byte breaks UTF-8 structure; UTF-8 checking stops there.
		bool ufailed = false;

		for (int i = 0; i < len; ++i) {
			char c = buf[i];

			// Check whether this is SJIS
			if (smode) {
				if (((0x40 <= c) && (c <= 0x7E)) || ((0x80 <= c) && (c <= 0xFC))) {
					++sjis;
				}

				smode = false;
			} else if (((0x81 <= c) && (c <= 0x9F)) || ((0xE0 <= c) && (c <= 0xF0))) {
				smode = true;
			}

			// Check whether this is EUC
			bool eflag = .IS_EUC_RANGE(c);

			if (emode) {
				if (eflag) {
					++euc;
				}

				emode = false;
			} else if (eflag) {
				emode = true;
			}

			// Check whether this is UTF8
			if (!ufailed) {
				if (umode < 1) {
					if ((c & 0x80) != 0) {
						if ((c & 0xE0) == 0xC0) {
							umode = 1;
						} else if ((c & 0xF0) == 0xE0) {
							umode = 2;
						} else if ((c & 0xF8) == 0xF0) {
							umode = 3;
						} else if ((c & 0xFC) == 0xF8) {
							umode = 4;
						} else if ((c & 0xFE) == 0xFC) {
							umode = 5;
						} else {
							ufailed = true;
							--utf8;
						}
					}
				} else {
					if ((c & 0xC0) == 0x80) {
						++utf8;
						--umode;
					} else {
						--utf8;
						umode = 0;
						ufailed = true;
					}
				}

				if (utf8 < 0) {
					utf8 = 0;
				}
			}
		}

		// Finally, return the encoding with the highest score
		if ((euc > sjis) && (euc > utf8)) {
			return .CHARSET_EUCJP;
		} else if ((!ufailed) && (utf8 > euc) && (utf8 > sjis)) {
			return .CHARSET_UTF8;
		} else if ((sjis > euc) && (sjis > utf8)) {
			return .CHARSET_CP932;
		} else {
			return .CHARSET_NONE;
		}
	}

/**
 * Looks up the decoder/encoder pair for a charset identifier.
 *
 * Params:
 *      charset = one of the CHARSET_* values
 *      char2int = receives the decoder, or null for an unknown charset;
 *                 may itself be null
 *      int2char = receives the encoder, or null for an unknown charset;
 *                 may itself be null
 */
extern (C)
nothrow @nogc
public void charset_getproc(int charset, .CHARSET_PROC_CHAR2INT* char2int, .CHARSET_PROC_INT2CHAR* int2char)

	do
	{
		.CHARSET_PROC_CHAR2INT c2i = null;
		.CHARSET_PROC_INT2CHAR i2c = null;

		switch (charset) {
			case .CHARSET_CP932:
				c2i = &.cp932_char2int;
				i2c = &.cp932_int2char;

				break;

			case .CHARSET_EUCJP:
				c2i = &.eucjp_char2int;
				i2c = &.eucjp_int2char;

				break;

			case .CHARSET_UTF8:
				c2i = &.utf8_char2int;
				i2c = &.utf8_int2char;

				break;

			default:
				break;
		}

		if (char2int != null) {
			*char2int = c2i;
		}

		if (int2char != null) {
			*int2char = i2c;
		}
	}

/**
 * Guesses the charset of a file from its first BUFLEN_DETECT bytes.
 *
 * Params:
 *      path = path of the file to inspect; must not be null
 *
 * Returns: the result of charset_detect_buf on the bytes read, or
 *          CHARSET_NONE when the file cannot be opened or nothing was read
 */
extern (C)
nothrow @nogc
public int charset_detect_file(const (char)* path)

	in
	{
		assert(path != null);
	}

	do
	{
		int charset = .CHARSET_NONE;
		core.stdc.stdio.FILE* fp = core.stdc.stdio.fopen(path, "rt");

		// Close the stream on every exit path.
		scope (exit) {
			if (fp != null) {
				core.stdc.stdio.fclose(fp);
				fp = null;
			}
		}

		if (fp != null) {
			char[.BUFLEN_DETECT] buf;
			size_t len = core.stdc.stdio.fread(&(buf[0]), buf[0].sizeof, buf.length, fp);

			if ((len > 0) && (len <= int.max)) {
				charset = .charset_detect_buf(&(buf[0]), cast(int)(len));
			}
		}

		return charset;
	}