Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
hcc18
parabix-devel
Commits
f8fb219b
Commit
f8fb219b
authored
6 years ago
by
Rob Cameron
Browse files
Options
Download
Email Patches
Plain Diff
Refactoring of gb18030 - use index calculation for 2-byte sequences
parent
eb9ae5c4
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
2717 additions
and
2789 deletions
+2717
-2789
UCD-scripts/generate_GB_18030_data.py
UCD-scripts/generate_GB_18030_data.py
+9
-2
icgrep/cc/encodings/GB_18030_data.cpp
icgrep/cc/encodings/GB_18030_data.cpp
+2662
-2774
icgrep/cc/encodings/GB_18030_data.h
icgrep/cc/encodings/GB_18030_data.h
+1
-1
icgrep/gb18030.cpp
icgrep/gb18030.cpp
+34
-12
icgrep/pablo/bixnum.cpp
icgrep/pablo/bixnum.cpp
+10
-0
icgrep/pablo/bixnum.h
icgrep/pablo/bixnum.h
+1
-0
No files found.
UCD-scripts/generate_GB_18030_data.py
View file @
f8fb219b
...
...
@@ -15,6 +15,13 @@ def GB_double_byte_table():
tbl
+=
"};
\n
"
return
tbl
def GB_double_byte_table():
    """Return C++ source text defining the flat GB_DoubleByteTable vector.

    The WHATWG 'gb18030' index is parsed, and the value stored under each
    key is rendered as a zero-padded four-digit hex literal, taken in
    ascending key order.  The literals are laid out by
    cformat.multiline_fill (called with ',' and 4 -- presumably the
    separator and the indent; confirm against cformat) and wrapped in a
    std::vector<unsigned> initializer.
    """
    idx = WHATWG_parser.parse_WHATWG_index_file('gb18030')
    hex_entries = []
    for key in sorted(idx.keys()):
        hex_entries.append('0x%04x' % idx[key])
    body = cformat.multiline_fill(hex_entries, ',', 4)
    return "std::vector<unsigned> GB_DoubleByteTable = {\n" + body + "};\n"
def
GB_range_table
():
idx
=
WHATWG_parser
.
parse_WHATWG_index_file
(
'gb18030-ranges'
)
...
...
@@ -26,13 +33,13 @@ def GB_range_table():
def
generate_GB_data_cpp
():
f
=
cformat
.
open_cpp_file_for_write
(
'GB_18030_data'
)
f
.
write
(
GB_double_byte_table
())
f
.
write
(
"
\n
std::vector<
std::vector<
UCD::codepoint_t>
>
& get_GB_DoubleByteTable() {
\n
return GB_DoubleByteTable;
\n
}
\n
"
)
f
.
write
(
"
\n
std::vector<UCD::codepoint_t> & get_GB_DoubleByteTable() {
\n
return GB_DoubleByteTable;
\n
}
\n
"
)
f
.
write
(
GB_range_table
())
f
.
write
(
"
\n
std::vector<std::pair<unsigned, unsigned>> & get_GB_RangeTable() {
\n
return GB_RangeTable;
\n
}
\n
"
)
f
.
close
()
f
=
cformat
.
open_header_file_for_write
(
'GB_18030_data'
)
cformat
.
write_imports
(
f
,
[
'<vector>'
,
'<UCD/unicode_set.h>'
])
f
.
write
(
"
\n
std::vector<
std::vector<
UCD::codepoint_t>
>
& get_GB_DoubleByteTable();
\n
"
)
f
.
write
(
"
\n
std::vector<UCD::codepoint_t> & get_GB_DoubleByteTable();
\n
"
)
f
.
write
(
"
\n
std::vector<std::pair<unsigned, unsigned>> & get_GB_RangeTable();
\n
"
)
cformat
.
close_header_file
(
f
)
...
...
This diff is collapsed.
Click to expand it.
icgrep/cc/encodings/GB_18030_data.cpp
View file @
f8fb219b
This diff is collapsed.
Click to expand it.
icgrep/cc/encodings/GB_18030_data.h
View file @
f8fb219b
...
...
@@ -11,7 +11,7 @@
#include <UCD/unicode_set.h>
#include <vector>
std
::
vector
<
std
::
vector
<
UCD
::
codepoint_t
>
>
&
get_GB_DoubleByteTable
();
std
::
vector
<
UCD
::
codepoint_t
>
&
get_GB_DoubleByteTable
();
std
::
vector
<
std
::
pair
<
unsigned
,
unsigned
>>
&
get_GB_RangeTable
();
...
...
This diff is collapsed.
Click to expand it.
icgrep/gb18030.cpp
View file @
f8fb219b
...
...
@@ -471,29 +471,51 @@ void GB_18030_CoreLogic::generatePabloMethod() {
std
::
vector
<
PabloAST
*>
byte2_basis
=
getInputStreamSet
(
"byte2_basis"
);
cc
::
Parabix_CC_Compiler_Builder
Byte1_compiler
(
getEntryScope
(),
BixNumArithmetic
(
pb
).
ZeroExtend
(
byte1_basis
,
8
));
// Initialize 16 bit stream variables with ASCII values.
PabloAST
*
zeroes
=
pb
.
createZeroes
();
Var
*
u16
[
16
];
std
::
vector
<
std
::
vector
<
UCD
::
codepoint_t
>>
GB_tbl
=
get_GB_DoubleByteTable
();
for
(
unsigned
i
=
0
;
i
<
BitsPerInputByte
;
++
i
)
{
for
(
unsigned
i
=
0
;
i
<
byte1_basis
.
size
();
++
i
)
{
u16
[
i
]
=
pb
.
createVar
(
"u16"
+
std
::
to_string
(
i
),
pb
.
createAnd
(
ASCII
,
byte1_basis
[
i
]));
}
for
(
unsigned
i
=
BitsPerInputByte
;
i
<
16
;
++
i
)
{
for
(
unsigned
i
=
byte1_basis
.
size
()
;
i
<
16
;
++
i
)
{
u16
[
i
]
=
pb
.
createVar
(
"u16"
+
std
::
to_string
(
i
),
zeroes
);
}
for
(
unsigned
char_code
=
0x81
;
char_code
<
0xFF
;
char_code
++
)
{
// Double byte sequences use a lookup table, with codepoints determined
// according to a calculated index.
std
::
vector
<
UCD
::
codepoint_t
>
GB_tbl
=
get_GB_DoubleByteTable
();
const
unsigned
maxGB2index
=
GB_tbl
.
size
()
-
1
;
// The valid values for the second byte of a 2-byte GB sequence are 0x40-0x7E and 0x80-0xFE.
// Normalize these 190 values to the range 0 through 189.
BixNum
x80
=
{
byte2_basis
[
7
]};
BixNum
b2
=
BixNumModularArithmetic
(
pb
).
Sub
(
BixNumModularArithmetic
(
pb
).
Sub
(
byte2_basis
,
x80
),
0x40
);
// The valid values for the first byte of a 2-byte GB sequence are 0x81-0xFE. Normalize
// to the range 0-125 as seven-bit value.
BixNum
b1
=
BixNumModularArithmetic
(
pb
).
Sub
(
BixNumArithmetic
(
pb
).
Truncate
(
byte1_basis
,
7
),
0x1
);
// Now compute the GB 2-byte index value: 190 * b1 + b2, as a 15-bit quantity.
BixNum
GB2idx
=
BixNumModularArithmetic
(
pb
).
Add
(
BixNumFullArithmetic
(
pb
).
Mul
(
b1
,
190
),
b2
);
const
unsigned
subTableBits
=
8
;
const
unsigned
subTableSize
=
1
<<
subTableBits
;
BixNum
tblIdxBasis
=
BixNumArithmetic
(
pb
).
HighBits
(
GB2idx
,
GB2idx
.
size
()
-
subTableBits
);
BixNum
subTblBasis
=
BixNumArithmetic
(
pb
).
Truncate
(
GB2idx
,
subTableBits
);
cc
::
Parabix_CC_Compiler_Builder
tblIdxCompiler
(
getEntryScope
(),
BixNumArithmetic
(
pb
).
ZeroExtend
(
tblIdxBasis
,
8
));
for
(
unsigned
tblCode
=
0
;
tblCode
<=
maxGB2index
;
tblCode
+=
subTableSize
)
{
std
::
stringstream
gbpfx
;
gbpfx
<<
"gb_"
<<
std
::
hex
<<
char_code
<<
"_16"
;
PabloAST
*
byte1
=
pb
.
createAnd
(
GB_prefix2
,
Byte1_c
ompiler
.
compileCC
(
makeCC
(
char_code
-
0x80
,
&
cc
::
Byte
)));
gbpfx
<<
"gb_
tbl
"
<<
std
::
hex
<<
(
tblCode
/
subTableSize
)
;
PabloAST
*
tblCodeStrm
=
pb
.
createAnd
(
GB_prefix2
,
tblIdxC
ompiler
.
compileCC
(
makeCC
(
tblCode
/
subTableSize
,
&
cc
::
Byte
)));
PabloBlock
*
inner
=
getEntryScope
()
->
createScope
();
PabloBuilder
nested
(
inner
);
std
::
vector
<
unsigned
>
subTable
=
fullByteTable_GB10830_byte2
(
GB_tbl
[
char_code
-
0x81
],
ReplacementCharacter
);
BixNumTableCompiler
tblComp
(
nested
,
subTable
,
gbpfx
.
str
());
BixNum
outputCode
=
tblComp
.
compileSubTableLookup
(
0
,
255
,
16
,
byte2_basis
);
BixNumTableCompiler
tblComp
(
nested
,
GB_tbl
,
gbpfx
.
str
());
BixNum
outputCode
=
tblComp
.
compileSubTableLookup
(
tblCode
,
std
::
min
(
tblCode
+
subTableSize
-
1
,
maxGB2index
),
16
,
subTblBasis
);
for
(
unsigned
i
=
0
;
i
<
16
;
i
++
)
{
nested
.
createAssign
(
u16
[
i
],
nested
.
createOr
(
u16
[
i
],
nested
.
createAnd
(
byte1
,
outputCode
[
i
])));
nested
.
createAssign
(
u16
[
i
],
nested
.
createOr
(
u16
[
i
],
nested
.
createAnd
(
tblCodeStrm
,
outputCode
[
i
])));
}
pb
.
createIf
(
byte1
,
inner
);
pb
.
createIf
(
tblCodeStrm
,
inner
);
}
Var
*
const
u16_output
=
getOutputStreamVar
(
"u16_basis"
);
for
(
unsigned
i
=
0
;
i
<
16
;
i
++
)
{
...
...
This diff is collapsed.
Click to expand it.
icgrep/pablo/bixnum.cpp
View file @
f8fb219b
...
...
@@ -132,6 +132,16 @@ BixNum BixNumArithmetic::Truncate(BixNum value, unsigned truncated_size) {
return
truncated
;
}
/// Return a BixNum made of the `highBitCount` highest-indexed elements of
/// `value`, kept in their original relative order.
///
/// @param value         the BixNum to extract from.
/// @param highBitCount  number of elements to keep; asserted to be no larger
///                      than value.size().
/// @return a BixNum of size `highBitCount` whose element k is
///         value[value.size() - highBitCount + k].
BixNum BixNumArithmetic::HighBits(BixNum value, unsigned highBitCount) {
    assert(highBitCount <= value.size());
    // Index in `value` of the first element that is retained.
    unsigned firstKept = value.size() - highBitCount;
    BixNum extracted(highBitCount);
    for (unsigned dst = 0, src = firstKept; dst < highBitCount; ++dst, ++src) {
        extracted[dst] = value[src];
    }
    return extracted;
}
/// Equality test on two BixNums, built as the negation of NEQ(value, test).
///
/// @param value  left-hand operand.
/// @param test   right-hand operand.
/// @return the result of mPB.createNot applied to NEQ(value, test).
PabloAST * BixNumArithmetic::EQ(BixNum value, BixNum test) {
    auto differs = NEQ(value, test);
    return mPB.createNot(differs);
}
...
...
This diff is collapsed.
Click to expand it.
icgrep/pablo/bixnum.h
View file @
f8fb219b
...
...
@@ -26,6 +26,7 @@ public:
BixNum
ZeroExtend
(
BixNum
value
,
unsigned
extended_size
);
BixNum
SignExtend
(
BixNum
value
,
unsigned
extended_size
);
BixNum
Truncate
(
BixNum
value
,
unsigned
truncated_size
);
BixNum
HighBits
(
BixNum
value
,
unsigned
highBitCount
);
private:
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment