|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <string.h> |
|
|
#include <fstream> |
|
|
#include <vector> |
|
|
#include <string> |
|
|
#include <iostream> |
|
|
#include <cstdlib> |
|
|
#include "InputFileStream.h" |
|
|
#include "OutputFileStream.h" |
|
|
#include "util/tokenize.hh" |
|
|
|
|
|
using namespace std; |
|
|
|
|
|
/**
 * Split a line into fields separated by the five-character delimiter " ||| ".
 *
 * @param line  NUL-terminated text to split. A null pointer is tolerated and
 *              yields an empty vector.
 * @return      The fields in order of appearance. A non-null line always
 *              produces at least one (possibly empty) field; the delimiters
 *              themselves are not included in any field.
 */
std::vector< std::string > splitLine(const char *line)
{
  std::vector< std::string > item;
  if (!line) {
    return item;
  }

  std::size_t start = 0;  // offset of the first character of the current field
  std::size_t i = 0;
  for (; line[i] != '\0'; ++i) {
    // The && chain short-circuits on the first mismatch, so the look-ahead
    // never reads beyond the terminating NUL.
    if (line[i] == ' ' &&
        line[i+1] == '|' &&
        line[i+2] == '|' &&
        line[i+3] == '|' &&
        line[i+4] == ' ') {
      // Two delimiters may share a space (" ||| ||| "): the second one then
      // begins one character before `start`. Clamp so the field between them
      // is empty instead of having a negative length.
      if (start > i) start = i;
      item.push_back( std::string( line+start, i-start ) );
      start = i+5;
      // Skip the three bars only; the trailing space is examined on the next
      // iteration because it may also open the following delimiter.
      i += 3;
    }
  }
  // Whatever follows the last delimiter (or the whole line if there was none).
  item.push_back( std::string( line+start, i-start ) );

  return item;
}
|
|
|
|
|
bool getLine( istream &fileP, vector< string > &item ) |
|
|
{ |
|
|
if (fileP.eof()) |
|
|
return false; |
|
|
|
|
|
string line; |
|
|
if (getline(fileP, line)) { |
|
|
item = splitLine(line.c_str()); |
|
|
return true; |
|
|
} else { |
|
|
return false; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
int main(int argc, char* argv[]) |
|
|
{ |
|
|
cerr << "Starting..." << endl; |
|
|
|
|
|
char* &fileNameDirect = argv[1]; |
|
|
Moses::InputFileStream fileDirect(fileNameDirect); |
|
|
|
|
|
|
|
|
|
|
|
if (fileDirect.fail()) { |
|
|
cerr << "ERROR: could not open extract file " << fileNameDirect << endl; |
|
|
exit(1); |
|
|
} |
|
|
istream &fileDirectP = fileDirect; |
|
|
|
|
|
char* &fileNameConsolidated = argv[2]; |
|
|
ostream *fileConsolidated; |
|
|
|
|
|
if (strcmp(fileNameConsolidated, "-") == 0) { |
|
|
fileConsolidated = &cout; |
|
|
} else { |
|
|
Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); |
|
|
bool success = outputFile->Open(fileNameConsolidated); |
|
|
if (!success) { |
|
|
cerr << "ERROR: could not open file phrase table file " |
|
|
<< fileNameConsolidated << endl; |
|
|
exit(1); |
|
|
} |
|
|
fileConsolidated = outputFile; |
|
|
} |
|
|
|
|
|
int i=0; |
|
|
while(true) { |
|
|
i++; |
|
|
if (i%1000 == 0) cerr << "." << flush; |
|
|
if (i%10000 == 0) cerr << ":" << flush; |
|
|
if (i%100000 == 0) cerr << "!" << flush; |
|
|
|
|
|
vector< string > itemDirect; |
|
|
if (! getLine(fileDirectP, itemDirect )) |
|
|
break; |
|
|
|
|
|
const vector< string > count = util::tokenize( itemDirect[4] ); |
|
|
float countEF = atof(count[0].c_str()); |
|
|
float countF = atof(count[1].c_str()); |
|
|
float prob = countF/countEF; |
|
|
|
|
|
(*fileConsolidated) << itemDirect[0] << " ||| " |
|
|
<< itemDirect[1] << " ||| " |
|
|
<< prob << " ||| " |
|
|
<< itemDirect[2] << "||| " |
|
|
<< itemDirect[4] << " " << countEF |
|
|
<< " ||| " << endl; |
|
|
} |
|
|
|
|
|
fileConsolidated->flush(); |
|
|
if (fileConsolidated != &cout) { |
|
|
delete fileConsolidated; |
|
|
} |
|
|
|
|
|
cerr << "Finished" << endl; |
|
|
} |
|
|
|
|
|
|