#!/usr/bin/perl
# parsegba.pl - parse gba input, write mys output
# GNU (C) Gaspar Sinai
# Tokyo 2002-03-26
# Emit the fixed header of the generated map file: its name, the COMM
# description of how a decoder turns the sparse synchronization points back
# into GB18030 codes, and the key/value width settings.
# The heredoc is interpolating, so the '@' in the e-mail address stays escaped.
print <<EOD;
NAME=gb18030.my
#------------------------------------------------------
COMM=Map sparse Unicode to GB18030 (2000-NOV) and back
COMM=It only has the synchronization points where
COMM=output could be calculated by linear increment.
COMM=Gaspar Sinai <gsinai\@yudit.org> Tokyo 2002-03-26
COMM=
COMM=Here is the algorithm to get GB 18030 codes from this
COMM=garbage:
COMM=If the input unicode character is less than 0x80 emit that code as a single byte
COMM=Search for closest lowest key that matches the unicode character:
COMM=
COMM=1) BMP
COMM= 1a If value is not greater than 0xffff output will be two bytes:
COMM= value + (unicode_value - closest_key)
COMM= As always higher byte is emitted first.
COMM= 1b If value is greater than 0xffff output will be four bytes:
COMM= num = linear(value) + (unicode_value - closest_key);
COMM= num = nonlinear(num);
COMM=
COMM= function nonlinear (num)
COMM= k3 = (num % 10)+0x30; num = num / 10;
COMM= k2 = (num % 126)+0x81; num = num / 126;
COMM= k1 = (num % 10)+0x30; num = num / 10;
COMM= k0 = (num % 126)+0x81;
COMM= return ((k0 << 24) + (k1 << 16) + (k2<<8) + k3);
COMM=
COMM= function linear(value):
COMM= k0 = (value >> 24) & 0xff; // 0x81..0xfe
COMM= k1 = (value >> 16) & 0xff; // 0x30..0x39
COMM= k2 = (value >> 8) & 0xff; // 0x81..0xfe
COMM= k3 = (value >> 0) & 0xff; // 0x30..0x39
COMM= num = (k0-0x81); num = num * 10;
COMM= num += (k1-0x30); num = num * 126;
COMM= num += (k2-0x81); num = num * 10;
COMM= num += (k3-0x30);
COMM= return (num);
COMM=
COMM=2) NON-BMP (unicode_value between 0x10000..0x10FFFF)
COMM= num = unicode_value - 0x10000 + 0x2E248;
COMM= nonlinear (num);
COMM= - 0x10000 should produce 0x90308130
COMM= - 0x10FFFF should be 0xE3329A35
#------------------------------------------------------
TYPE=0
SECTION=encode
ENCODE=1
#
# key 1 for 16 bit (16-bit unicode)
# value 2 for 32 bit (4-byte-gb) values
#
KEY_WIDTH=1
VALUE_WIDTH=2
KEY_LENGTH=0
VALUE_LENGTH=0
#
EOD
# Strictures apply lexically from here to the end of the file; every variable
# used below is declared with 'my'.
use strict;
use warnings;

# Read "KEY VALUE" lines (uppercase hex; 4-digit unicode key, 1-8 digit GB
# value) from the files named on the command line or from STDIN, and print a
# "KEY -> VALUE" synchronization point for every line that a decoder could
# NOT derive from the previous line by linear increment.
#
# A line is derivable only when BOTH hold:
#   * its key is exactly one more than the previous mapped key, and
#   * its value is the successor of the previous mapped value
#     (plain +1 for values up to 0xffff, incGB() for larger four-byte values).
# The key check matters because the decoder adds (unicode - closest_key) to
# the value of the closest sync point: a jump in the keys with a contiguous
# value would otherwise be decoded with the wrong offset.
my $last_key;        # key of the previous mapped line; undef until one is seen
my $last_vle = 0;    # value of the previous mapped line

LINE:
while (my $line = <>)
{
    chomp $line;
    next LINE unless ($line =~ /^([0-9A-F]{4})\s+([0-9A-F]{1,8})/);
    my $key = hex($1);
    my $vle = hex($2);

    # Key 0 is always written out and does not take part in run tracking.
    if ($key == 0)
    {
        printf "%04X -> %08X\n", $key, $vle;
        next LINE;
    }

    # Lines with a zero value and keys below 0x80 are not written to the map.
    next LINE if ($vle == 0 || $key < 0x80);

    # Value that would continue the current run.
    my $expected = ($vle > 0xffff) ? incGB($last_vle) : $last_vle + 1;

    my $continues_run = defined($last_key)
        && $key == $last_key + 1
        && $vle == $expected;

    printf "%04X -> %08X\n", $key, $vle unless $continues_run;

    $last_key = $key;
    $last_vle = $vle;
}

# Closing entry for the last 16-bit key.
printf "%04X -> %08X\n", 0xFFFF, 0x8431A439;
# Keys above 0xFFFF are not emitted (the header declares a 16-bit key width);
# the two values below can still be used to check the NON-BMP formula.
#printf ("%04X -> %08X\n", 0x10000, 0x90308130);
#printf ("%04X -> %08X\n", 0x10FFFF, 0xE3329A35);
exit (0);
#
# increment a GB code
#
sub incGB
{
    # Return the code that follows the given GB code: convert it to its
    # linear index with fromGB(), step the index by one, and convert the
    # result back with toGB().
    my ($gb_code) = @_;

    my $next_index = fromGB($gb_code) + 1;
    return toGB($next_index);
}
#
# Convert linear code to GB
#
sub toGB
{
    # Convert a linear index into a packed four-byte GB code.
    #
    # Parameter: the linear index (an integer), as produced by fromGB().
    # Returns:   (k0 << 24) + (k1 << 16) + (k2 << 8) + k3, where k0 and k2
    #            are offset from 0x81 (126 steps each) and k1 and k3 are
    #            offset from 0x30 (10 steps each).
    #
    # The argument is copied first: the elements of @_ alias the caller's
    # variables, so working on $_[0] directly would overwrite the caller's
    # value and would die when the sub is called with a literal.
    my ($num) = @_;

    # Peel off the mixed-radix digits, least significant first.  int() makes
    # the truncating division explicit instead of relying on '%' to truncate
    # a fractional operand.
    my $k3 = ($num % 10) + 0x30;
    $num = int($num / 10);
    my $k2 = ($num % 126) + 0x81;
    $num = int($num / 126);
    my $k1 = ($num % 10) + 0x30;
    $num = int($num / 10);
    my $k0 = ($num % 126) + 0x81;

    return (($k0 << 24) + ($k1 << 16) + ($k2 << 8) + $k3);
}
#
# Convert GB to linear code
#
sub fromGB
{
    # Convert a packed four-byte GB code into its linear index.
    #
    # The four bytes are read most significant first.  Bytes one and three
    # are offset from 0x81 and weighted by 126; bytes two and four are
    # offset from 0x30 and weighted by 10.
    my ($gb_code) = @_;

    my ($b0, $b1, $b2, $b3) = map { ($gb_code >> $_) & 0xff } (24, 16, 8, 0);

    my $index = $b0 - 0x81;
    $index = $index * 10  + ($b1 - 0x30);
    $index = $index * 126 + ($b2 - 0x81);
    $index = $index * 10  + ($b3 - 0x30);

    return $index;
}