Refactoring of gb18030 - use index calculation for 2-byte sequences

f8fb219b · Rob Cameron · eb9ae5c4 · f8fb219b · f8fb219b · f8fb219b
Commit f8fb219b authored 6 years ago by Rob Cameron
6 changed files
--- a/UCD-scripts/generate_GB_18030_data.py
+++ b/UCD-scripts/generate_GB_18030_data.py
@@ -15,6 +15,13 @@ def GB_double_byte_table():
            tbl += "};\n"
    return tbl

+def GB_double_byte_table():  # Returns C++ source text defining GB_DoubleByteTable as one flat vector of hex literals.
+    idx = WHATWG_parser.parse_WHATWG_index_file('gb18030')  # parsed 'gb18030' index; used below as a mapping (keys() and idx[k])
+    tbl = "std::vector<unsigned> GB_DoubleByteTable = {\n    "  # NOTE(review): accessor emitted by generate_GB_data_cpp returns std::vector<UCD::codepoint_t> &; presumably codepoint_t is unsigned - confirm
+    tbl += cformat.multiline_fill(['0x%04x' % idx[k] for k in sorted(idx.keys())], ',', 4)  # one literal per key in ascending key order; table position equals the key only if keys are dense from 0 - TODO confirm
+    tbl += "};\n"  # close the C++ initializer list
+    return tbl
+

 def GB_range_table():
    idx = WHATWG_parser.parse_WHATWG_index_file('gb18030-ranges')
@@ -26,13 +33,13 @@ def GB_range_table():
 def generate_GB_data_cpp():
    f = cformat.open_cpp_file_for_write('GB_18030_data')
    f.write(GB_double_byte_table())
-    f.write("\nstd::vector<std::vector<UCD::codepoint_t>> & get_GB_DoubleByteTable() {\n   return GB_DoubleByteTable;\n}\n")
+    f.write("\nstd::vector<UCD::codepoint_t> & get_GB_DoubleByteTable() {\n   return GB_DoubleByteTable;\n}\n")
    f.write(GB_range_table())
    f.write("\nstd::vector<std::pair<unsigned, unsigned>> & get_GB_RangeTable() {\n   return GB_RangeTable;\n}\n")
    f.close()
    f = cformat.open_header_file_for_write('GB_18030_data')
    cformat.write_imports(f, ['<vector>', '<UCD/unicode_set.h>'])
-    f.write("\nstd::vector<std::vector<UCD::codepoint_t>> & get_GB_DoubleByteTable();\n")
+    f.write("\nstd::vector<UCD::codepoint_t> & get_GB_DoubleByteTable();\n")
    f.write("\nstd::vector<std::pair<unsigned, unsigned>> & get_GB_RangeTable();\n")
    cformat.close_header_file(f)


--- a/icgrep/cc/encodings/GB_18030_data.cpp
+++ b/icgrep/cc/encodings/GB_18030_data.cpp
--- a/icgrep/cc/encodings/GB_18030_data.h
+++ b/icgrep/cc/encodings/GB_18030_data.h
@@ -11,7 +11,7 @@
 #include <UCD/unicode_set.h>
 #include <vector>

-std::vector<std::vector<UCD::codepoint_t>> & get_GB_DoubleByteTable();
+std::vector<UCD::codepoint_t> & get_GB_DoubleByteTable();

 std::vector<std::pair<unsigned, unsigned>> & get_GB_RangeTable();


--- a/icgrep/gb18030.cpp
+++ b/icgrep/gb18030.cpp
@@ -471,29 +471,51 @@ void GB_18030_CoreLogic::generatePabloMethod() {
    std::vector<PabloAST *> byte2_basis = getInputStreamSet("byte2_basis");
    
    
-    cc::Parabix_CC_Compiler_Builder Byte1_compiler(getEntryScope(), BixNumArithmetic(pb).ZeroExtend(byte1_basis, 8));
+    // Initialize 16 bit stream variables with ASCII values.
    PabloAST * zeroes = pb.createZeroes();
    Var * u16[16];
-    std::vector<std::vector<UCD::codepoint_t>> GB_tbl = get_GB_DoubleByteTable();
-    for (unsigned i = 0; i < BitsPerInputByte; ++i) {
+    for (unsigned i = 0; i < byte1_basis.size(); ++i) {
        u16[i] = pb.createVar("u16" + std::to_string(i), pb.createAnd(ASCII, byte1_basis[i]));
    }
-    for (unsigned i = BitsPerInputByte; i < 16; ++i) {
+    for (unsigned i = byte1_basis.size(); i < 16; ++i) {
        u16[i] = pb.createVar("u16" + std::to_string(i), zeroes);
    }
-    for (unsigned char_code = 0x81; char_code < 0xFF; char_code++) {
+    
+    //  Double byte sequences use a lookup table, with codepoints determined
+    //  according to a calculated index.
+    
+    std::vector<UCD::codepoint_t> GB_tbl = get_GB_DoubleByteTable();
+    const unsigned maxGB2index = GB_tbl.size()-1;
+
+    // The valid values for the second byte of a 2-byte GB sequence are 0x40-0x7E and 0x80-0xFE.
+    // Normalize these values to the range 0 through 189 (190 distinct values).
+    BixNum x80 = {byte2_basis[7]};
+    BixNum b2 = BixNumModularArithmetic(pb).Sub(BixNumModularArithmetic(pb).Sub(byte2_basis, x80), 0x40);
+
+    // The valid values for the first byte of a 2-byte GB sequence are 0x81-0xFE.  Normalize
+    // to the range 0-125 as a seven-bit value (drop the high bit, then subtract 1).
+    BixNum b1 = BixNumModularArithmetic(pb).Sub(BixNumArithmetic(pb).Truncate(byte1_basis, 7), 0x1);
+    // Now compute the GB 2-byte index value:  190 * b1 + b2, as a 15-bit quantity.
+    BixNum GB2idx = BixNumModularArithmetic(pb).Add(BixNumFullArithmetic(pb).Mul(b1, 190), b2);
+
+    const unsigned subTableBits = 8;
+    const unsigned subTableSize = 1 << subTableBits;
+    BixNum tblIdxBasis = BixNumArithmetic(pb).HighBits(GB2idx, GB2idx.size()-subTableBits);
+    BixNum subTblBasis = BixNumArithmetic(pb).Truncate(GB2idx, subTableBits);
+    cc::Parabix_CC_Compiler_Builder tblIdxCompiler(getEntryScope(), BixNumArithmetic(pb).ZeroExtend(tblIdxBasis, 8));
+
+    for (unsigned tblCode = 0; tblCode <= maxGB2index; tblCode+=subTableSize) {
        std::stringstream gbpfx;
-        gbpfx << "gb_" << std::hex << char_code << "_16";
-        PabloAST * byte1 = pb.createAnd(GB_prefix2, Byte1_compiler.compileCC(makeCC(char_code - 0x80, &cc::Byte)));
+        gbpfx << "gb_tbl" << std::hex << (tblCode/subTableSize);
+        PabloAST * tblCodeStrm = pb.createAnd(GB_prefix2, tblIdxCompiler.compileCC(makeCC(tblCode/subTableSize, &cc::Byte)));
        PabloBlock * inner = getEntryScope()->createScope();
        PabloBuilder nested(inner);
-        std::vector<unsigned> subTable = fullByteTable_GB10830_byte2(GB_tbl[char_code - 0x81], ReplacementCharacter);
-        BixNumTableCompiler tblComp(nested, subTable, gbpfx.str());
-        BixNum outputCode = tblComp.compileSubTableLookup(0, 255, 16, byte2_basis);
+        BixNumTableCompiler tblComp(nested, GB_tbl, gbpfx.str());
+        BixNum outputCode = tblComp.compileSubTableLookup(tblCode, std::min(tblCode + subTableSize -1, maxGB2index), 16, subTblBasis);
        for (unsigned i = 0; i < 16; i++) {
-            nested.createAssign(u16[i], nested.createOr(u16[i], nested.createAnd(byte1, outputCode[i])));
+            nested.createAssign(u16[i], nested.createOr(u16[i], nested.createAnd(tblCodeStrm, outputCode[i])));
        }
-        pb.createIf(byte1, inner);
+        pb.createIf(tblCodeStrm, inner);
    }
    Var * const u16_output = getOutputStreamVar("u16_basis");
    for (unsigned i = 0; i < 16; i++) {

--- a/icgrep/pablo/bixnum.cpp
+++ b/icgrep/pablo/bixnum.cpp
@@ -132,6 +132,16 @@ BixNum BixNumArithmetic::Truncate(BixNum value, unsigned truncated_size) {
    return truncated;
 }

+BixNum BixNumArithmetic::HighBits(BixNum value, unsigned highBitCount) {  // Returns a BixNum holding the highBitCount highest-indexed elements of value, order preserved.
+    assert(highBitCount <= value.size());  // cannot extract more elements than value holds
+    unsigned offset = value.size() - highBitCount;  // index in value of the first element that is kept
+    BixNum extracted(highBitCount);  // result sized to exactly highBitCount elements
+    for (unsigned i = 0; i < highBitCount; i++) {
+        extracted[i] = value[i + offset];  // extracted[0] is value[offset]; elements below offset are dropped
+    }
+    return extracted;
+}
+
 PabloAST * BixNumArithmetic::EQ(BixNum value, BixNum test) {
    return mPB.createNot(NEQ(value, test));
 }

--- a/icgrep/pablo/bixnum.h
+++ b/icgrep/pablo/bixnum.h
@@ -26,6 +26,7 @@ public:
    BixNum ZeroExtend(BixNum value, unsigned extended_size);
    BixNum SignExtend(BixNum value, unsigned extended_size);
    BixNum Truncate(BixNum value, unsigned truncated_size);
+    BixNum HighBits(BixNum value, unsigned highBitCount);


 private: