|
|
#include "line_splitter.h" |
|
|
|
|
|
namespace probingpt |
|
|
{ |
|
|
|
|
|
/**
 * Split one line into its "|||"-separated fields and store them in a
 * line_text record.
 *
 * Fields are consumed in this order: source_phrase, target_phrase, prob,
 * word_align, counts, sparse_score, property. Each field is passed through
 * Trim() before being stored.
 *
 * Parsing stops at the first field that is not present, and the record filled
 * so far is returned. Fields that were not reached keep whatever value the
 * default construction of line_text gave them. Every field is checked before
 * it is read, so a short line never dereferences an exhausted iterator.
 *
 * @param textin  the raw line to split.
 * @param scfg    accepted for interface compatibility; it currently has no
 *                effect on the result.
 * @return the record holding the fields found on the line.
 */
line_text splitLine(const StringPiece &textin, bool scfg)
{
  typedef util::TokenIter<util::MultiCharacter> FieldIter;

  const char delim[] = "|||";
  line_text output;

  // No SCFG-specific processing is performed in this function.
  (void) scfg;

  FieldIter it(textin, util::MultiCharacter(delim));

  // Field 1
  if (!it) return output;
  output.source_phrase = Trim(*it);

  // Field 2
  ++it;
  if (!it) return output;
  output.target_phrase = Trim(*it);

  // Field 3
  ++it;
  if (!it) return output;
  output.prob = Trim(*it);

  // Field 4
  ++it;
  if (!it) return output;
  output.word_align = Trim(*it);

  // Field 5
  ++it;
  if (!it) return output;
  output.counts = Trim(*it);

  // Field 6
  ++it;
  if (!it) return output;
  output.sparse_score = Trim(*it);

  // Field 7
  ++it;
  if (!it) return output;
  output.property = Trim(*it);

  return output;
}
|
|
|
|
|
std::vector<unsigned char> splitWordAll1(const StringPiece &textin) |
|
|
{ |
|
|
const char delim[] = " "; |
|
|
const char delim2[] = "-"; |
|
|
std::vector<unsigned char> output; |
|
|
|
|
|
|
|
|
if (textin.size() == 0) { |
|
|
return output; |
|
|
} |
|
|
|
|
|
|
|
|
util::TokenIter<util::MultiCharacter> it(textin, util::MultiCharacter(delim)); |
|
|
|
|
|
|
|
|
while (it) { |
|
|
|
|
|
util::TokenIter<util::MultiCharacter> itInner(*it, |
|
|
util::MultiCharacter(delim2)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
output.push_back((unsigned char) (atoi(itInner->data()))); |
|
|
itInner++; |
|
|
output.push_back((unsigned char) (atoi(itInner->data()))); |
|
|
it++; |
|
|
} |
|
|
|
|
|
return output; |
|
|
|
|
|
} |
|
|
|
|
|
/**
 * Placeholder with no implementation: the record passed in is left exactly as
 * it was received.
 *
 * @param output  the record to reformat; currently not read or modified.
 */
void reformatSCFG(line_text &output)
{
  // Intentionally a no-op.
  (void) output;
}
|
|
|
|
|
} |
|
|
|
|
|
|