Skip to content

24.8 如何在GB2312与Unicode之间互相转换

Q:

在Windows上有WideCharToMultiByte()、MultiByteToWideChar()等函数可用,Unix上有类似的函数吗?

A: scz

请"man iconv_open",然后动用Google进行相关搜索。之所以没有建议"man iconv", 因为iconv(1)对应着一个命令,Linux上应该看iconv(3),Solaris上应该看iconv(3C), 其它系统各有变化,但"man iconv_open"总是一致的。

我们就iconv*()函数进行了简单测试:


/*
 * gcc -DLinux -Wall -pipe -O3 -s -o iconv_test iconv_test.c
 * gcc -DSparc -Wall -pipe -O3 -s -o iconv_test iconv_test.c
 */

#if 0

我的一台FreeBSD测试机上居然没有/usr/include/iconv.h

而另一台AIX测试机上则根本不支持GB2312,编译时需指定库

gcc -DAix -Wall -pipe -O3 -s -o iconv_test iconv_test.c -liconv

显然,在各个缺省安装的Unix系统上iconv()被支持的程度不一,除非自行安装 libiconv,否则这是一个高度不可移植的选择,决定放弃iconv()。

本程序仅为测试目的存在,没有实用价值。

#endif

/*
 * NOTE(review): the header names were lost when this page was extracted.
 * <stdio.h>, <stdlib.h> and <iconv.h> are required by the code below;
 * <string.h> and <errno.h> are assumed for the remaining two - confirm.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <iconv.h>

/*
 * Decide whether a byte is echoed verbatim in the text column of the dump.
 *
 * The enabled variant lets every byte from 0x80 to 0xFE through, presumably
 * so that multi-byte text such as GB2312 stays readable on a terminal; the
 * disabled variant restricts the text column to printable 7-bit ASCII.
 */
static int outputBinaryIsShown ( unsigned char c )
{
#if 1
    return( ( c >= ' ' ) && ( c != 0x7F ) && ( c < 0xFF ) );
#else
    return( ( c >= ' ' ) && ( c < 0x7F ) );
#endif
}  /* end of outputBinaryIsShown */

/*
 * Write one dump row: the offset, up to 16 hex bytes (a '-' replaces the
 * blank in front of the 9th byte), padding for a short row, then the text
 * column in which non-displayable bytes are shown as '.'.
 *
 *   out    - stream that receives the row
 *   row    - first byte of the row
 *   count  - number of valid bytes in the row, 1..16
 *   offset - offset of the row within the whole buffer
 */
static void outputBinaryRow
(
    FILE                   *out,
    const unsigned char    *row,
    size_t                  count,
    size_t                  offset
)
{
    size_t  j;

    fprintf( out, "%08X ", ( unsigned int )offset );
    for ( j = 0; j < count; j++ )
    {
        fprintf( out, "%c%02X", ( j == 8 ) ? '-' : ' ', ( unsigned int )row[j] );
    }
    /*
     * a short (last) row is padded so that the text column stays aligned
     */
    for ( j = count; j < 16; j++ )
    {
        fprintf( out, "   " );
    }
    fprintf( out, "    " );
    for ( j = 0; j < count; j++ )
    {
        fputc( outputBinaryIsShown( row[j] ) ? row[j] : '.', out );
    }
    fprintf( out, "\n" );
    return;
}  /* end of outputBinaryRow */

/*
 * Hex-dump a byte buffer, 16 bytes per row.
 *
 *   out          - stream that receives the dump
 *   byteArray    - buffer to dump, it is only read
 *   byteArrayLen - number of bytes to dump
 *
 * A header line with the byte count is always written; for an empty buffer
 * nothing else follows.
 */
static void outputBinary ( FILE *out, const unsigned char *byteArray, size_t byteArrayLen )
{
    size_t  offset;

    fprintf( out, "byteArray [ %u bytes ] ->\n", ( unsigned int )byteArrayLen );
    for ( offset = 0; offset < byteArrayLen; offset += 16 )
    {
        size_t  left    = byteArrayLen - offset;

        outputBinaryRow( out, byteArray + offset, ( left < 16 ) ? left : 16, offset );
    }
    return;
}  /* end of outputBinary */

#ifdef Linux

/*
 * Linux variant of the test.
 *
 * Converts a fixed GB2312 string to UCS-2 and back again with iconv(),
 * dumping the buffer to stderr before and after each conversion.
 * sizeof( in ) counts the terminating NUL, so the NUL is converted too.
 *
 * The string literal is converted as GB2312, so this source file has to be
 * stored in GB2312 for the round trip to be meaningful.
 *
 * Returns EXIT_SUCCESS when both conversions succeed, EXIT_FAILURE otherwise.
 */
int main ( int argc, char * argv[] )
{
    int     ret     = EXIT_FAILURE;
    iconv_t cd      = ( iconv_t )-1;
    char    in[]    = "GB2312与Unicode双向转换测试";
    size_t  inlen   = sizeof( in );
    char   *inp     = in;
    /*
     * UCS-2 needs at most two bytes for every GB2312 byte
     */
    char    out[sizeof(in)*2];
    size_t  outlen  = sizeof( out );
    char   *outp    = out;

    ( void )argc;
    ( void )argv;

    /*
     * Encoding names are case-insensitive here
     */
    cd  = iconv_open( "UCS-2", "GB2312" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) );
    /*
     * iconv() returns size_t, failure is reported as ( size_t )-1
     */
    if ( ( size_t )-1 == iconv( cd, &inp, &inlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );
    /*
     * Swap roles: the bytes just produced in out[] become the input and
     * in[] becomes the destination of the reverse conversion
     */
    outlen  = sizeof( out ) - outlen;
    outp    = out;
    inlen   = sizeof( in );
    inp     = in;
    iconv_close( cd );
    cd      = iconv_open( "GB2312", "UCS-2" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    if ( ( size_t )-1 == iconv( cd, &outp, &outlen, &inp, &inlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) - inlen );
    ret     = EXIT_SUCCESS;

main_exit:

    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd  = ( iconv_t )-1;
    }
    return( ret );
}  /* end of main */

#if 0

[scz@ /home/scz/src]> ./iconv_test
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
byteArray [ 42 bytes ] ->
00000000 47 00 42 00 32 00 33 00-31 00 32 00 0E 4E 55 00 G.B.2.3.1.2..NU.
00000010 6E 00 69 00 63 00 6F 00-64 00 65 00 CC 53 11 54 n.i.c.o.d.e.
00000020 6C 8F 62 63 4B 6D D5 8B-00 00
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00

#endif

#elif defined(Sparc)

/*
 * SPARC/Solaris variant of the test.
 *
 * Runs the chain gb2312 -> UTF-8 -> UCS-2 -> UTF-8 -> gb2312 with iconv(),
 * dumping the buffer to stderr before the first conversion and after every
 * conversion. sizeof( in ) counts the terminating NUL, so the NUL is
 * converted too.
 *
 * The string literal is converted as GB2312, so this source file has to be
 * stored in GB2312 for the round trip to be meaningful.
 *
 * The input-buffer argument of iconv() is passed as ( void * ) because its
 * declared type differs between systems ( char ** vs. const char ** ); a
 * void pointer converts implicitly to either one.
 *
 * Returns EXIT_SUCCESS when all conversions succeed, EXIT_FAILURE otherwise.
 */
int main ( int argc, char * argv[] )
{
    int     ret     = EXIT_FAILURE;
    iconv_t cd      = ( iconv_t )-1;
    char    in[]    = "GB2312与Unicode双向转换测试";
    size_t  inlen   = sizeof( in );
    char   *inp     = in;
    /*
     * holds the UTF-8 form of the text
     */
    char    out[sizeof(in)*2];
    size_t  outlen  = sizeof( out );
    char   *outp    = out;
    /*
     * holds the UCS-2 form of the text
     */
    char    xxx[sizeof(out)];
    size_t  xxxlen  = sizeof( xxx );
    char   *xxxp    = xxx;

    ( void )argc;
    ( void )argv;

    /*
     * Encoding names are case-sensitive here, and SPARC/Solaris cannot
     * convert directly from "gb2312" to "UCS-2", hence the detour via UTF-8
     */
    cd  = iconv_open( "UTF-8", "gb2312" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) );
    /*
     * step 1: in[] ( gb2312 ) -> out[] ( UTF-8 )
     */
    if ( ( size_t )-1 == iconv( cd, ( void * )&inp, &inlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );
    outlen  = sizeof( out ) - outlen;
    outp    = out;
    iconv_close( cd );
    cd      = iconv_open( "UCS-2", "UTF-8" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    /*
     * step 2: out[] ( UTF-8 ) -> xxx[] ( UCS-2 )
     */
    if ( ( size_t )-1 == iconv( cd, ( void * )&outp, &outlen, &xxxp, &xxxlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )xxx, sizeof( xxx ) - xxxlen );
    xxxlen  = sizeof( xxx ) - xxxlen;
    xxxp    = xxx;
    outlen  = sizeof( out );
    outp    = out;
    iconv_close( cd );
    cd      = iconv_open( "UTF-8", "UCS-2" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    /*
     * step 3: xxx[] ( UCS-2 ) -> out[] ( UTF-8 )
     */
    if ( ( size_t )-1 == iconv( cd, ( void * )&xxxp, &xxxlen, &outp, &outlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )out, sizeof( out ) - outlen );
    outlen  = sizeof( out ) - outlen;
    outp    = out;
    inlen   = sizeof( in );
    inp     = in;
    iconv_close( cd );
    cd      = iconv_open( "gb2312", "UTF-8" );
    if ( ( iconv_t )-1 == cd )
    {
        perror( "iconv_open() error" );
        goto main_exit;
    }
    /*
     * step 4: out[] ( UTF-8 ) -> in[] ( gb2312 )
     */
    if ( ( size_t )-1 == iconv( cd, ( void * )&outp, &outlen, &inp, &inlen ) )
    {
        perror( "iconv() error" );
        goto main_exit;
    }
    outputBinary( stderr, ( unsigned char * )in, sizeof( in ) - inlen );
    ret     = EXIT_SUCCESS;

main_exit:

    if ( ( iconv_t )-1 != cd )
    {
        iconv_close( cd );
        cd  = ( iconv_t )-1;
    }
    return( ret );
}  /* end of main */

#if 0

[scz@ /export/home/scz/src]> ./iconv_test
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode.
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00
byteArray [ 35 bytes ] ->
00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312
00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8
00000020 AF 95 00
byteArray [ 44 bytes ] ->
00000000 FE FF 00 47 00 42 00 32-00 33 00 31 00 32 4E 0E ..G.B.2.3.1.2N.
00000010 00 55 00 6E 00 69 00 63-00 6F 00 64 00 65 53 CC .U.n.i.c.o.d.eS.
00000020 54 11 8F 6C 63 62 6D 4B-8B D5 00 00 T.
byteArray [ 35 bytes ] ->
00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312
00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8
00000020 AF 95 00
byteArray [ 28 bytes ] ->
00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode.
00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00

#endif

#endif


鉴于缺省情况下iconv*()不具有良好的可移植性,可以考虑其它方案。其中一种替代方案是在.c中预定义相关码表,自行完成查表转换工作。但这种方案要考虑字节序的问题,参看前面x86/Linux、SPARC/Solaris上的输出信息,也不具有良好可移植性。