#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

"""
GB2312汉字字符集

一级汉字(按音序排列)：
    第一字节 B0-D7 第二字节 A1-FE 共(0xd7-0xb0+1)*94-5=3755
二级汉字(按偏旁部首排列)：
    第一字节 D8-F7 第二字节 A1-FE 共(0xf7-0xd8+1)*94=3008 
"""
def gen_gb2312():
	"""Generate ../data/gb2312-hanzi.utf, a UTF-8 listing of the GB2312 hanzi.

	GB2312 hanzi layout (two bytes per character):
	  * Level 1 (ordered by pronunciation): first byte B0-D7, second byte
	    A1-FE, (0xd7-0xb0+1)*94-5 = 3755 characters.
	  * Level 2 (ordered by radical): first byte D8-F7, second byte A1-FE,
	    (0xf7-0xd8+1)*94 = 3008 characters.

	Output layout: characters are emitted in code order, with a newline after
	every second byte whose low nibble is 0xF and two newlines after each
	first-byte row. Nothing is done when the output file already exists.

	The GB2312 -> UTF-8 conversion uses Python's built-in "gb2312" codec
	rather than an external ``iconv`` process, so no temporary file is left
	in the working directory and a failure is reported instead of being
	silently followed by a success message.

	Raises:
	    IOError / OSError: if the output file cannot be created, e.g. when
	        the ../data directory does not exist.
	"""
	target = "../data/gb2312-hanzi.utf"
	if os.path.exists(target):
		return

	# Build the raw GB2312 byte stream in memory (works on Python 2 and 3).
	raw = bytearray()
	for lead in range(0xb0, 0xf7 + 1):
		for trail in range(0xa1, 0xfe + 1):
			raw.append(lead)
			raw.append(trail)
			if trail & 0x0f == 0x0f:
				raw.append(0x0a)  # line break after xAF, xBF, ... xEF
		raw.extend(b"\n\n")

	# The five codes D7FA-D7FE (right after U+5EA7) are unassigned in
	# GB2312; "ignore" drops them, matching what ``iconv -c`` did.
	text = raw.decode("gb2312", "ignore")

	with open(target, "wb") as out:
		out.write(text.encode("utf-8"))
	print("File ../data/gb2312-hanzi.utf generated.")

def gen_gbk():
	"""Placeholder for a GBK generator; not implemented, does nothing."""
	return None

def gen_gb18030():
	"""Placeholder for a GB18030 generator; not implemented, does nothing."""
	return None

def gen_unicode_han():
	"""Unfinished stub: builds a table of integer pairs and discards it.

	Nothing is written to disk and the function returns None.

	NOTE(review): each pair looks like (first code point, number of
	consecutive code points) for a Unicode range of Han characters --
	confirm against the Unicode data files before relying on the counts.
	"""
	han_ranges = [
		(0x2e80, 26),
		(0x2e9b, 89),
		(0x2f00, 214),
		(0x3005, 1),
		(0x3007, 1),
		(0x3021, 9),
		(0x3038, 4),
		(0x3400, 6582),
		(0x4e00, 20924),
		(0xf900, 302),
		(0xfa30, 59),
		(0xfa70, 106),
		(0x20000, 42711),
		(0x2f800, 542),
	]
	# The table is not consumed yet; it is kept for the future implementation.
	return None

def gen_big5():
	"""Placeholder for a Big5 generator; not implemented, does nothing."""
	return None

if __name__ == "__main__":
	# Script entry point: run every generator once, in a fixed order.
	# Only gen_gb2312 produces output; the others are stubs.
	for generate in (gen_gb2312, gen_gbk, gen_gb18030, gen_unicode_han, gen_big5):
		generate()
