... | ... | @@ -14,22 +14,22 @@ Record_separators: ........................1............................1 |
|
|
|
|
|
```
|
|
|
|
|
|
The Parabix `FilterByMask` operation can do this for us, if we set up a mask stream that selects all of the data except the second column and its following comma.
|
|
|
The Parabix `FilterByMask` operation can do this for us, if we set up a mask stream that selects all of the data except the second column and its preceding comma.
|
|
|
|
|
|
```
|
|
|
Data stream: Henderson,Paul,ph@sfu.ca⏎Lin,Qingshan,1234@zju.edu.cn⏎
|
|
|
To keep: 1111111111.....11111111111111.........1111111111111111
|
|
|
To keep: 111111111.....11111111111111.........11111111111111111
|
|
|
|
|
|
```
|
|
|
|
|
|
How do we calculate this mask? With the following set of operations using a
|
|
|
`PabloBuilder pb`.
|
|
|
```
|
|
|
PabloAST * F1start = pb.createNot(pb.createAdvance(pb.createNot(record_separators), 1);
|
|
|
PabloAST * F1start = pb.createNot(pb.createAdvance(pb.createNot(Record_separators), 1));
|
|
|
PabloAST * F1follow = pb.createScanTo(F1start, Field_separators);
|
|
|
PabloAST * F2start = pb.createAdvance(F1follow, 1);
|
|
|
PabloAST * F2follow = pb.createScanTo(F2start, Field_separators);
|
|
|
PabloAST * toDelete = pb.createIntrinsicCall(pablo::Intrinsic::InclusiveSpan, {F2start, F2follow});
|
|
|
PabloAST * toDelete = pb.createIntrinsicCall(pablo::Intrinsic::SpanUpTo, {F1follow, F2follow});
|
|
|
PabloAST * toKeep = pb.createNot(toDelete);
|
|
|
```
|
|
|
|
... | ... | @@ -39,7 +39,49 @@ F1start: 1........................1............................ |
|
|
F1follow: .........1..................1.........................
|
|
|
F2start: ..........1..................1........................
|
|
|
F2follow: ..............1......................1................
|
|
|
toDelete: ..........11111..............111111111................
|
|
|
toKeep: 1111111111.....11111111111111.........1111111111111111
|
|
|
toDelete: .........11111..............111111111.................
|
|
|
toKeep: 111111111.....11111111111111.........11111111111111111
|
|
|
```
|
|
|
|
|
|
A Pablo Kernel to create this mask can be created as follows.
|
|
|
```
|
|
|
MaskOutField2::MaskOutField2(BuilderRef b, StreamSet * Record_separators,
|
|
|
StreamSet * Field_separators,
|
|
|
StreamSet * toKeep)
|
|
|
: PabloKernel(b, "MaskOutField2",
|
|
|
{Binding{"Record_separators", Record_separators},
|
|
|
Binding{"Field_separators", Field_separators}},
|
|
|
{Binding{"toKeep", toKeep}}) {}
|
|
|
|
|
|
void MaskOutField2::generatePabloMethod() {
|
|
|
PabloBuilder pb(getEntryScope());
|
|
|
Var * Record_separators = pb.createExtract(getInputStreamVar("Record_separators"), pb.getInteger(0));
|
|
|
Var * Field_separators = pb.createExtract(getInputStreamVar("Field_separators"), pb.getInteger(0));
|
|
|
PabloAST * F1start = pb.createNot(pb.createAdvance(pb.createNot(Record_separators), 1));
|
|
|
PabloAST * F1follow = pb.createScanTo(F1start, Field_separators);
|
|
|
PabloAST * F2start = pb.createAdvance(F1follow, 1);
|
|
|
PabloAST * F2follow = pb.createScanTo(F2start, Field_separators);
|
|
|
PabloAST * toDelete = pb.createIntrinsicCall(pablo::Intrinsic::SpanUpTo, {F1follow, F2follow});
|
|
|
PabloAST * toKeep = pb.createNot(toDelete);
|
|
|
pb.createAssign(pb.createExtract(getOutputStreamVar("toKeep"), pb.getInteger(0)), pb.createInFile(toKeep));
|
|
|
}
|
|
|
```
|
|
|
|
|
|
Of course, a slightly different kernel is needed for masking out a column other than the
|
|
|
second one. This should be written using a `columnNo` parameter to a more generic
|
|
|
kernel, and performing the necessary number of `ScanTo` and `Advance` operations.
|
|
|
Note that the kernel name must be different for each value of `columnNo`; the constructor below builds it by appending `columnNo` to a common prefix.
|
|
|
```
|
|
|
MaskOutField::MaskOutField(BuilderRef b, StreamSet * Record_separators,
|
|
|
StreamSet * Field_separators,
|
|
|
StreamSet * toKeep,
|
|
|
unsigned columnNo)
|
|
|
: PabloKernel(b, "MaskOutField" + std::to_string(columnNo),
|
|
|
{Binding{"Record_separators", Record_separators},
|
|
|
Binding{"Field_separators", Field_separators}},
|
|
|
{Binding{"toKeep", toKeep}}) {}
|
|
|
```
|
|
|
|
|
|
Finally, the first column must be handled differently. In this case, there is
|
|
|
no preceding comma, so the mask should zero out the following comma rather than the
|
|
|
preceding comma. |