|
|
发表于 2006-2-11 00:17:50
|
显示全部楼层
算你运气好!
应该说gb2312跟utf-8的概念应该不是一个层次上的。gb2312跟unicode都是字符的编码方式,而utf8只是unicode的一种表示方式而已。他们的区别主要在iso8859-1定义的字符集之外(用多字节来表示一个字符)。在linux下有一个iconv库可以完成这些编码之间的转换!
下面的代码是gb2312字节流跟utf16字节流之间的转换函数。gb2312跟utf8的转换应该差不多!
两个extern函数为转换入口函数,其它都为内部函数。
- /*@author:huangh
- *@create time:2005/12/16
- */
- #include <stdio.h>
- #include <iconv.h>
- #include <errno.h>
- #include <stdlib.h>
- #include <string.h>
/* Debug trace: prints the name of the enclosing function and the current line. */
#define ALIVE() printf("still alive here [%s]:[%d]\n", __func__, __LINE__)
/* Get the low 8 bits of an unsigned short.
 * The argument is parenthesized so that expressions such as (a | b) are
 * masked as a whole instead of being split by operator precedence. */
#define LOWER_SHORT(num) (0x0ff & (num))
/* Get the high 8 bits of an unsigned short (bits 8..15, shifted down). */
#define HIGH_SHORT(num) ((0xff00 & (num)) >> 8)
/* Build-time switch: when defined, char_to_char16() and char16_to_char()
 * swap the order of the two bytes that make up each 16-bit unit. */
//#define _BIG_ENDIAN
/* Build-time switch: when defined, utf16_to_char() dumps its buffers to stderr. */
//#define DEBUG
/*
 * One 16-bit code unit as exchanged with callers of this file.
 * Note: the underlying type is unsigned int, so each array element is wider
 * than 16 bits; the byte-packing helpers only look at the low 16 bits.
 */
typedef unsigned int char_16;

/*
 * Count the code units that precede the terminating 0 unit.
 *
 * in_char - zero-terminated array of code units; NULL is treated as empty.
 *
 * Returns the number of units, not counting the terminator.
 */
static int char16len(const char_16 *in_char)
{
    int count = 0;

    if (in_char == NULL) {
        return 0;
    }
    while (in_char[count] != 0) {
        count++;
    }
    return count;
}
/*
 * Convert a GB2312 byte stream to UCS-2 bytes using iconv.
 *
 * inchar   - input bytes.
 * in_size  - in: number of input bytes; out: input bytes left unconverted.
 * outchar  - output buffer.
 * out_size - in: capacity of outchar in bytes; out: bytes still unused.
 *
 * Returns 0 on success, -1 on a NULL argument or when iconv_open()/iconv()
 * fails (the iconv failures are reported through perror()).
 *
 * NOTE(review): the byte order emitted for the unmarked name "UCS-2" is
 * decided by the iconv implementation - confirm it matches what
 * char_to_char16() expects on the target platform.
 */
static int gb2312_to_utf16(char *inchar, size_t *in_size, char *outchar, size_t *out_size)
{
    iconv_t cd;
    char *in_pos = inchar;
    char *out_pos = outchar;
    int rc = -1;

    if (inchar == NULL || in_size == NULL || outchar == NULL || out_size == NULL) {
        return -1;
    }

    cd = iconv_open("UCS-2", "GB2312");
    if (cd == (iconv_t)-1) {
        perror("iconv_open() error");
        return -1;
    }

    /* iconv() advances in_pos/out_pos; outchar keeps the start of the output. */
    if (iconv(cd, &in_pos, in_size, &out_pos, out_size) == (size_t)-1) {
        perror("iconv() error");
        goto out;
    }

#ifdef DEBUG
    {
        /* Dump exactly the bytes that were produced: [outchar, out_pos). */
        const unsigned char *p = (const unsigned char *)outchar;
        const unsigned char *end = (const unsigned char *)out_pos;

        for (; p != end; p++) {
            fprintf(stderr, "%2x:", (unsigned int)*p);
        }
        fprintf(stderr, "\n");
    }
#endif

    rc = 0;
out:
    iconv_close(cd);
    return rc;
}
- static int char_to_char16(char *in, size_t in_size, char_16 out[], size_t out_size)
- {
- int i = 0;
- unsigned short high, low;
-
- printf("insize is [%d], out size is [%d]\n", in_size, out_size);
- if(in_size > 2*out_size)
- {
- return -1;
- }
- ALIVE();
- for(; i<out_size; i++)
- {
- #ifdef _BIG_ENDIAN
- high = in[2*i+1]&0xff;
- low = in[2*i]&0xff;
- #else
- high = in[2*i]&0xff;
- low = in[2*i+1]&0xff;
- #endif
- out[i] = (high<<8)|low;
- }
- return 0;
- }
- static int char16_to_char(char_16 in[], size_t size_in, char *out, size_t size_out)
- {
- int i = 0;
- if(size_out < 2*size_in)
- {
- return -1;
- }
- for(; i< size_in; i++)
- {
- #ifdef _BIG_ENDIAN
- out[i*2] = LOWER_SHORT(in[i]);
- out[i*2 + 1] = HIGH_SHORT(in[i]);
- #else
- out[i*2] = HIGH_SHORT(in[i]);
- out[i*2 + 1] = LOWER_SHORT(in[i]);
- #endif
- }
- return 0;
- }
/*
 * Convert UCS-2 bytes to a GB2312 byte stream using iconv.
 *
 * inchar   - input bytes.
 * in_size  - in: number of input bytes; out: input bytes left unconverted.
 * outchar  - output buffer.
 * out_size - in: capacity of outchar in bytes; out: bytes still unused.
 *
 * Returns 0 on success, -1 on a NULL argument or when iconv_open()/iconv()
 * fails (the iconv failures are reported through perror()).
 *
 * NOTE(review): the byte order assumed for the unmarked name "UCS-2" is
 * decided by the iconv implementation - confirm it matches what
 * char16_to_char() produces on the target platform.
 */
static int utf16_to_gb2312(char *inchar, size_t *in_size, char *outchar, size_t *out_size)
{
    iconv_t cd;
    char *in_pos = inchar;
    char *out_pos = outchar;
    int rc = -1;

    if (inchar == NULL || in_size == NULL || outchar == NULL || out_size == NULL) {
        return -1;
    }

    cd = iconv_open("GB2312", "UCS-2");
    if (cd == (iconv_t)-1) {
        perror("iconv_open() error");
        return -1;
    }

    if (iconv(cd, &in_pos, in_size, &out_pos, out_size) == (size_t)-1) {
        perror("iconv() error");
    } else {
        rc = 0;
    }

    /* Single exit: the descriptor is released on success and failure alike. */
    iconv_close(cd);
    return rc;
}
- extern int char_to_utf16(char *in_char, char_16 *out_wchar, size_t out_size)
- {
- size_t in_len = strlen(in_char) + 1;
- size_t tmp_len = 2*in_len;
- size_t tmp_len_org;
- char *tmp;
-
- printf("inlen is [%d], out size is [%d]\n", in_len, out_size);
- if(in_len > 2*out_size)
- {
- return -1;
- }
- if(NULL == (tmp = (char *)malloc(tmp_len)))
- {
- return -1;
- }
- tmp_len_org = tmp_len;
-
- if( 0 != gb2312_to_utf16(in_char, &in_len, tmp, &tmp_len))
- {
- goto error;
- }
- {
- printf("tmp length is [%d]:[%d]\n", tmp_len, in_len);
- }
- if(0 != char_to_char16(tmp, tmp_len_org - tmp_len, out_wchar, out_size))
- {
- goto error;
- }
-
- ALIVE();
- free(tmp);
- return 0;
- error:
- free(tmp);
- return -1;
- }
- extern int utf16_to_char(char_16 *in_char, char *out_char, size_t out_size)
- {
- size_t in_size = char16len(in_char) + 1;
- char *tmp;
- size_t in_len ;
- size_t out_len;
- #ifdef DEBUG
- int j;
- for(j=0;j<in_size;j++)
- {
- fprintf(stderr, "[%4x]:",in_char[j]);
- }
- fprintf(stderr, "\n");
- #endif
- if(NULL == (tmp = (char *)malloc(2*in_size)))
- {
- return -1;
- }
-
- memset(tmp, 0, 2*in_size);
- if(0 != char16_to_char(in_char, in_size, tmp, 2*in_size))
- {
- goto error;
- }
- #ifdef DEBUG
- for(j=0; j<2*in_size; j++)
- {
- fprintf(stderr, "[%2x]:",tmp[j]);
- }
- fprintf(stderr, "\n");
- #endif
- in_len = in_size*2;
- out_len = out_size;
- if( 0 != (utf16_to_gb2312(tmp, &in_len, out_char, &out_len)))
- {
- goto error;
- }
-
-
- free(tmp);
- return 0;
- error:
- free(tmp);
- return -1;
- }
复制代码
另外还有一些资料我明天贴上来!
补充一些文档!
1、什么是UTF-8?它与UNICODE是什么关系?
解答:
Unicode的最初目标,是用1个16位的编码来为超过65000个字符提供映射。但这还不够,它不能覆盖全部历史上的文字,也不能解决实现上的难题 (implementation headaches),尤其在那些基于网络的应用中。已有的软件必须做大量的工作来实现16位的数据。
因此,Unicode用一些基本的保留字符制定了三套编码方式。它们分别是UTF-8,UTF-16和UTF-32。正如名字所示,在UTF-8中,字符是以8位序列来编码的,用一个或几个字节来表示一个字符。这种方式的最大好处,是UTF-8保留了ASCII字符的编码作为它的一部分,例如,在UTF-8和ASCII中,“A”的编码都是0x41。UTF-16和UTF-32分别是Unicode的16位和32位编码方式。考虑到最初的目的,通常说的Unicode就是指UTF-16。
2、unicode的参考网站
http://www.unicode.org/unicode/standard/principles.html.
3、unicode三种编码之间的转换
http://www.unicode.org/Public/PROGRAMS/CVTUTF/
4、也可以通过查表的方式转换gb2312码和unicode码。表文件见附件gb2312.txt,文件中有使用说明。 |
本帖子中包含更多资源
您需要 登录 才可以下载或查看,没有帐号?注册
x
|