中国語
簡体字、繁体字の両方を扱う機会があったのでまず関連するエンコーディングモジュールをテスト。
#!/usr/bin/perl
# Round-trip test for groups of Chinese encodings provided by Encode.
#
# For every code point in the BMP, each encoding of a group encodes the
# character and decodes the result again.  Code points that round-trip in
# some, but not all, encodings of the group are written to test-encode.txt
# together with the percent-escaped bytes each encoding produced.  Summary
# counts per group are reported on STDERR.
use strict;
use warnings;
use Encode;

open(my $out, ">:encoding(utf8)", "test-encode.txt")
    or die "test-encode.txt $!\n";

# Unicode noncharacters inside the BMP (U+FDD0..U+FDEF, U+FFFE, U+FFFF).
# Kept as a hash so the per-code-point membership test is a single lookup.
my %is_noncharacter = map { $_ => 1 } (0xfdd0 .. 0xfdef, 0xfffe, 0xffff);

# check(\@encoding_names)
#
# Compares the encodings named in the array reference against each other.
# Dies if a name cannot be resolved to an Encode encoding.  Writes one line
# to test-encode.txt for each code point on which the encodings disagree,
# and emits a summary via warn:
#   all_ok  - number of code points that round-trip in every encoding
#   <name>  - number of code points that round-trip in that encoding
# Returns nothing.
sub check {
    my ($list) = @_;

    # One record per encoding: its name, a round-trip counter and, below,
    # the resolved encoding object plus per-character scratch fields.
    my @list = map { +{ name => $_, count => 0 } } @$list;
    for my $entry (@list) {
        my $n = Encode::resolve_alias($entry->{name});
        $n or die "bad $entry->{name}\n";
        $entry->{enc} = Encode::find_encoding($n);
        $entry->{enc} or die "cant find encoding for $n $entry->{name}\n";
    }

    my $all_ok = 0;
    for my $cord (0 .. 0xFFFF) {

        # Skip surrogates and other problematic code points (noncharacters).
        # The surrogate block is U+D800..U+DFFF; U+D000..U+D7FF are regular
        # characters and must be tested like any other.
        next if $cord >= 0xD800 and $cord <= 0xDFFF;
        next if $is_noncharacter{$cord};

        my $src = chr($cord);
        my $ok  = 0;
        for my $entry (@list) {
            $entry->{e} = Encode::encode($entry->{enc}, $src);
            if (not defined($entry->{e}) or not length($entry->{e})) {
                # The encoder produced nothing for this character.
                $entry->{e}      = 'null';
                $entry->{escape} = 'null';
                $entry->{u}      = 'null';
            }
            else {
                # Percent-escape every encoded byte; /s makes "." match a
                # newline byte (0x0A) as well.
                $entry->{escape} = $entry->{e};
                $entry->{escape} =~ s/(.)/'%' . unpack('H2', $1)/seg;

                # The character counts as supported only if decoding the
                # bytes gives back exactly the original character.
                $entry->{u} = Encode::decode($entry->{enc}, $entry->{e});
                if ($entry->{u} eq $src) {
                    ++$ok;
                    ++$entry->{count};
                }
            }
        }

        if ($ok == @list) {
            # Every encoding agrees: count it, nothing to report.
            ++$all_ok;
            next;
        }
        elsif ($ok == 0) {
            # No encoding of this group supports the character at all.
            next;
        }

        # The encodings disagree: log the code point, the character and the
        # escaped bytes produced by each encoding.
        print {$out} sprintf("[%x]%s", $cord, $src),
            join(' ', map { "$_->{name}=$_->{escape}" } @list), "\n";
    }

    warn "all_ok=$all_ok ",
        join(' ', map { "$_->{name}=$_->{count}" } @list), "\n";
    return;
}

# Simplified Chinese and Traditional Chinese encoding groups.
my @cn = qw( euc-cn cp936 );
my @tw = qw( cp950 big5-hkscs big5-eten );

check(\@cn);
check(\@tw);

# Buffered write errors only surface at close time, so check it.
close($out) or die "test-encode.txt $!\n";
結果
all_ok=7572 euc-cn=7573 cp936=24070 all_ok=13664 cp950=19840 big5-hkscs=18468 big5-eten=14025
まとめ