|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <iostream> |
|
|
#include <set> |
|
|
|
|
|
#include <boost/lexical_cast.hpp> |
|
|
|
|
|
#include "util/double-conversion/double-conversion.h" |
|
|
#include "util/string_piece.hh" |
|
|
#include "util/tokenize_piece.hh" |
|
|
|
|
|
#include "Hypergraph.h" |
|
|
|
|
|
using namespace std; |
|
|
static const string kBOS = "<s>"; |
|
|
static const string kEOS = "</s>"; |
|
|
|
|
|
namespace MosesTuning |
|
|
{ |
|
|
|
|
|
StringPiece NextLine(util::FilePiece& from) |
|
|
{ |
|
|
StringPiece line; |
|
|
while ((line = from.ReadLine()).starts_with("#")); |
|
|
return line; |
|
|
} |
|
|
|
|
|
Vocab::Vocab() : eos_( FindOrAdd(kEOS)), bos_(FindOrAdd(kBOS)) |
|
|
{ |
|
|
} |
|
|
|
|
|
const Vocab::Entry &Vocab::FindOrAdd(const StringPiece &str) |
|
|
{ |
|
|
#if BOOST_VERSION >= 104200 |
|
|
Map::const_iterator i= map_.find(str, Hash(), Equals()); |
|
|
#else |
|
|
std::string copied_str(str.data(), str.size()); |
|
|
Map::const_iterator i = map_.find(copied_str.c_str()); |
|
|
#endif |
|
|
if (i != map_.end()) return *i; |
|
|
char *copied = static_cast<char*>(piece_backing_.Allocate(str.size() + 1)); |
|
|
memcpy(copied, str.data(), str.size()); |
|
|
copied[str.size()] = 0; |
|
|
return *map_.insert(Entry(copied, map_.size())).first; |
|
|
} |
|
|
|
|
|
double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static pair<Edge*,size_t> ReadEdge(util::FilePiece &from, Graph &graph) |
|
|
{ |
|
|
Edge* edge = graph.NewEdge(); |
|
|
StringPiece line = from.ReadLine(); |
|
|
util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter(" ||| ")); |
|
|
|
|
|
for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { |
|
|
StringPiece got = *i; |
|
|
if ('[' == *got.data() && ']' == got.data()[got.size() - 1]) { |
|
|
|
|
|
char *end_ptr; |
|
|
unsigned long int child = std::strtoul(got.data() + 1, &end_ptr, 10); |
|
|
UTIL_THROW_IF(end_ptr != got.data() + got.size() - 1, HypergraphException, "Bad non-terminal" << got); |
|
|
UTIL_THROW_IF(child >= graph.VertexSize(), HypergraphException, "Reference to vertex " << child << " but we only have " << graph.VertexSize() << " vertices. Is the file in bottom-up format?"); |
|
|
edge->AddWord(NULL); |
|
|
edge->AddChild(child); |
|
|
} else { |
|
|
const Vocab::Entry &found = graph.MutableVocab().FindOrAdd(got); |
|
|
edge->AddWord(&found); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
++pipes; |
|
|
for (util::TokenIter<util::SingleCharacter, true> i(*pipes, util::SingleCharacter(' ')); i; ++i) { |
|
|
StringPiece fv = *i; |
|
|
if (!fv.size()) break; |
|
|
size_t equals = fv.find_last_of("="); |
|
|
UTIL_THROW_IF(equals == fv.npos, HypergraphException, "Failed to parse feature '" << fv << "'"); |
|
|
StringPiece name = fv.substr(0,equals); |
|
|
StringPiece value = fv.substr(equals+1); |
|
|
int processed; |
|
|
float score = converter.StringToFloat(value.data(), value.length(), &processed); |
|
|
UTIL_THROW_IF(isnan(score), HypergraphException, "Failed to parse weight '" << value << "'"); |
|
|
edge->AddFeature(name,score); |
|
|
} |
|
|
|
|
|
++pipes; |
|
|
size_t sourceCovered = boost::lexical_cast<size_t>(*pipes); |
|
|
return pair<Edge*,size_t>(edge,sourceCovered); |
|
|
} |
|
|
|
|
|
void Graph::Prune(Graph* pNewGraph, const SparseVector& weights, size_t minEdgeCount) const |
|
|
{ |
|
|
|
|
|
Graph& newGraph = *pNewGraph; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
map<const Edge*, FeatureStatsType> edgeBackwardScores; |
|
|
map<const Edge*, size_t> edgeHeads; |
|
|
vector<FeatureStatsType> vertexBackwardScores(vertices_.Size(), kMinScore); |
|
|
vector<vector<const Edge*> > outgoing(vertices_.Size()); |
|
|
|
|
|
|
|
|
for (size_t vi = 0; vi < vertices_.Size(); ++vi) { |
|
|
|
|
|
const Vertex& vertex = vertices_[vi]; |
|
|
const vector<const Edge*>& incoming = vertex.GetIncoming(); |
|
|
if (!incoming.size()) { |
|
|
vertexBackwardScores[vi] = 0; |
|
|
} else { |
|
|
for (size_t ei = 0; ei < incoming.size(); ++ei) { |
|
|
|
|
|
edgeHeads[incoming[ei]]= vi; |
|
|
FeatureStatsType incomingScore = incoming[ei]->GetScore(weights); |
|
|
for (size_t i = 0; i < incoming[ei]->Children().size(); ++i) { |
|
|
|
|
|
size_t childId = incoming[ei]->Children()[i]; |
|
|
UTIL_THROW_IF(vertexBackwardScores[childId] == kMinScore, |
|
|
HypergraphException, "Graph was not topologically sorted. curr=" << vi << " prev=" << childId); |
|
|
outgoing[childId].push_back(incoming[ei]); |
|
|
incomingScore += vertexBackwardScores[childId]; |
|
|
} |
|
|
edgeBackwardScores[incoming[ei]]= incomingScore; |
|
|
|
|
|
if (incomingScore > vertexBackwardScores[vi]) vertexBackwardScores[vi] = incomingScore; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
vector<FeatureStatsType> vertexForwardScores(vertices_.Size(), kMinScore); |
|
|
map<const Edge*, FeatureStatsType> edgeForwardScores; |
|
|
for (size_t i = 1; i <= vertices_.Size(); ++i) { |
|
|
size_t vi = vertices_.Size() - i; |
|
|
|
|
|
if (!outgoing[vi].size()) { |
|
|
vertexForwardScores[vi] = 0; |
|
|
} else { |
|
|
for (size_t ei = 0; ei < outgoing[vi].size(); ++ei) { |
|
|
|
|
|
FeatureStatsType outgoingScore = 0; |
|
|
|
|
|
outgoingScore += vertexForwardScores[edgeHeads[outgoing[vi][ei]]]; |
|
|
|
|
|
edgeForwardScores[outgoing[vi][ei]] = outgoingScore; |
|
|
|
|
|
for (size_t i = 0; i < outgoing[vi][ei]->Children().size(); ++i) { |
|
|
size_t siblingId = outgoing[vi][ei]->Children()[i]; |
|
|
if (siblingId != vi) { |
|
|
|
|
|
outgoingScore += vertexBackwardScores[siblingId]; |
|
|
} |
|
|
} |
|
|
outgoingScore += outgoing[vi][ei]->GetScore(weights); |
|
|
if (outgoingScore > vertexForwardScores[vi]) vertexForwardScores[vi] = outgoingScore; |
|
|
|
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
multimap<FeatureStatsType, const Edge*> edgeScores; |
|
|
for (size_t i = 0; i < edges_.Size(); ++i) { |
|
|
const Edge* edge = &(edges_[i]); |
|
|
if (edgeForwardScores.find(edge) == edgeForwardScores.end()) { |
|
|
|
|
|
|
|
|
edgeForwardScores[edge] = vertexForwardScores[edgeHeads[edge]]; |
|
|
} |
|
|
FeatureStatsType score = edgeForwardScores[edge] + edgeBackwardScores[edge]; |
|
|
edgeScores.insert(pair<FeatureStatsType, const Edge*>(score,edge)); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
multimap<FeatureStatsType, const Edge*>::const_reverse_iterator ei = edgeScores.rbegin(); |
|
|
size_t edgeCount = 1; |
|
|
while(edgeCount < minEdgeCount && ei != edgeScores.rend()) { |
|
|
++ei; |
|
|
++edgeCount; |
|
|
} |
|
|
multimap<FeatureStatsType, const Edge*>::const_iterator lowest = edgeScores.begin(); |
|
|
if (ei != edgeScores.rend()) lowest = edgeScores.lower_bound(ei->first); |
|
|
|
|
|
|
|
|
set<size_t> retainedVertices; |
|
|
set<const Edge*> retainedEdges; |
|
|
for (; lowest != edgeScores.end(); ++lowest) { |
|
|
|
|
|
retainedEdges.insert(lowest->second); |
|
|
retainedVertices.insert(edgeHeads[lowest->second]); |
|
|
for (size_t i = 0; i < lowest->second->Children().size(); ++i) { |
|
|
retainedVertices.insert(lowest->second->Children()[i]); |
|
|
} |
|
|
} |
|
|
newGraph.SetCounts(retainedVertices.size(), retainedEdges.size()); |
|
|
|
|
|
|
|
|
map<size_t,size_t> oldIdToNew; |
|
|
size_t vi = 0; |
|
|
for (set<size_t>::const_iterator i = retainedVertices.begin(); i != retainedVertices.end(); ++i, ++vi) { |
|
|
|
|
|
oldIdToNew[*i] = vi; |
|
|
Vertex* vertex = newGraph.NewVertex(); |
|
|
vertex->SetSourceCovered(vertices_[*i].SourceCovered()); |
|
|
} |
|
|
|
|
|
for (set<const Edge*>::const_iterator i = retainedEdges.begin(); i != retainedEdges.end(); ++i) { |
|
|
Edge* newEdge = newGraph.NewEdge(); |
|
|
const Edge* oldEdge = *i; |
|
|
for (size_t j = 0; j < oldEdge->Words().size(); ++j) { |
|
|
newEdge->AddWord(oldEdge->Words()[j]); |
|
|
} |
|
|
for (size_t j = 0; j < oldEdge->Children().size(); ++j) { |
|
|
newEdge->AddChild(oldIdToNew[oldEdge->Children()[j]]); |
|
|
} |
|
|
newEdge->SetFeatures(oldEdge->Features()); |
|
|
Vertex& newHead = newGraph.vertices_[oldIdToNew[edgeHeads[oldEdge]]]; |
|
|
newHead.AddEdge(newEdge); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void ReadGraph(util::FilePiece &from, Graph &graph) |
|
|
{ |
|
|
|
|
|
|
|
|
StringPiece line = from.ReadLine(); |
|
|
UTIL_THROW_IF(line.compare("# target ||| features ||| source-covered") != 0, HypergraphException, "Incorrect format spec on first line: '" << line << "'"); |
|
|
line = NextLine(from); |
|
|
|
|
|
|
|
|
util::TokenIter<util::SingleCharacter, false> i(line, util::SingleCharacter(' ')); |
|
|
unsigned long int vertices = boost::lexical_cast<unsigned long int>(*i); |
|
|
++i; |
|
|
unsigned long int edges = boost::lexical_cast<unsigned long int>(*i); |
|
|
graph.SetCounts(vertices, edges); |
|
|
|
|
|
for (size_t i = 0; i < vertices; ++i) { |
|
|
line = NextLine(from); |
|
|
unsigned long int edge_count = boost::lexical_cast<unsigned long int>(line); |
|
|
Vertex* vertex = graph.NewVertex(); |
|
|
for (unsigned long int e = 0; e < edge_count; ++e) { |
|
|
pair<Edge*,size_t> edge = ReadEdge(from, graph); |
|
|
vertex->AddEdge(edge.first); |
|
|
|
|
|
|
|
|
if (!e) { |
|
|
vertex->SetSourceCovered(edge.second); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
}; |
|
|
|