| |
| |
| #include <sstream> |
| #include <cstring> |
| #include <algorithm> |
| #include <iostream> |
| #include <stdexcept> |
|
|
| #include <boost/pool/pool_alloc.hpp> |
|
|
| #include "tpt_tokenindex.h" |
| #include "ug_typedefs.h" |
|
|
| using namespace std; |
| namespace sapt |
| { |
|
|
| TokenIndex:: |
| TokenIndex(string unkToken) |
| : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0) |
| , startIdx(0), endIdx(0) |
| { |
| lock.reset(new boost::mutex()); |
| }; |
|
|
#if 0
// Disabled convenience constructor: builds the object and immediately
// opens fname via open(). Kept for reference only; not compiled.
TokenIndex::
TokenIndex(string fname, string unkToken, bool dyna)
  : ridx(0), unkLabel(unkToken)
{
  open(fname, unkToken, dyna);
}
#endif
|
|
| void |
| TokenIndex:: |
| open(string fname, string unkToken,bool dyna) |
| { |
| if (access(fname.c_str(),F_OK)) |
| { |
| ostringstream msg; |
| msg << "TokenIndex::open: File '" << fname << "' does not exist."; |
| throw std::runtime_error(msg.str().c_str()); |
| } |
|
|
| file.open(fname); |
| if (!file.is_open()) |
| { |
| ostringstream msg; |
| msg << "TokenIndex::open: Error opening file '" << fname << "'."; |
| throw std::runtime_error(msg.str().c_str()); |
| } |
|
|
| this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data())); |
| unkId = *(reinterpret_cast<id_type const*>(file.data()+4)); |
|
|
| startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type)); |
| endIdx = startIdx + numTokens; |
| comp.base = reinterpret_cast<char const*>(endIdx); |
| if (!unkToken.empty()) |
| { |
| Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp); |
| unkId = ((bla < endIdx && unkToken == comp.base+bla->offset) |
| ? bla->id |
| : numTokens); |
| } |
| this->dynamic=dyna; |
| if (dyna) |
| { |
| this->str2idExtra.reset(new map<string,id_type>()); |
| this->newWords.reset(new vector<string>()); |
| } |
| } |
|
|
| void |
| TokenIndex:: |
| close() |
| { |
| file.close(); |
| } |
|
|
| TokenIndex:: |
| CompFunc:: |
| CompFunc() |
| {}; |
|
|
| bool |
| TokenIndex:: |
| CompFunc:: |
| operator()(Entry const& A, char const* w) |
| { |
| return strcmp(base+A.offset,w) < 0; |
| }; |
|
|
| id_type |
| TokenIndex:: |
| operator[](char const* p) const |
| { |
| if (startIdx != endIdx) |
| { |
| Entry const* bla = lower_bound(startIdx,endIdx,p,comp); |
| if (bla != endIdx && !strcmp(comp.base+bla->offset,p)) |
| return bla->id; |
| if (!dynamic) return unkId; |
| } |
| else if (!dynamic) return strcmp(p,"NULL") && unkId; |
| |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens); |
| pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem); |
| if (foo.second) |
| newWords->push_back(foo.first->first); |
| return foo.first->second; |
| } |
|
|
| id_type |
| TokenIndex:: |
| operator[](string const& w) const |
| { |
| return (*this)[w.c_str()]; |
| } |
|
|
| vector<char const*> |
| TokenIndex:: |
| reverseIndex() const |
| { |
| size_t numToks = endIdx-startIdx; |
|
|
| |
|
|
| vector<char const*> v(numToks,NULL); |
| |
| for (Entry const* x = startIdx; x != endIdx; x++) |
| { |
| if (x->id >= v.size()) |
| v.resize(x->id+1); |
| v[x->id] = comp.base+x->offset; |
| } |
| |
| return v; |
| } |
|
|
| char const* const |
| TokenIndex:: |
| operator[](id_type id) const |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| |
| |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| if (id < ridx.size()) |
| return ridx[id]; |
| |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (dynamic && id < ridx.size()+newWords->size()) |
| return (*newWords)[id-ridx.size()].c_str(); |
| return unkLabel.c_str(); |
| } |
|
|
| void |
| TokenIndex:: |
| iniReverseIndex() |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| } |
|
|
|
|
| char const* const |
| TokenIndex:: |
| operator[](id_type id) |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| if (id < ridx.size()) |
| return ridx[id]; |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (dynamic && id < ridx.size()+newWords->size()) |
| return (*newWords)[id-ridx.size()].c_str(); |
| return unkLabel.c_str(); |
| } |
|
|
| string |
| TokenIndex:: |
| toString(vector<id_type> const& v) |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| ostringstream buf; |
| for (size_t i = 0; i < v.size(); i++) |
| buf << (i ? " " : "") << (*this)[v[i]]; |
| return buf.str(); |
| } |
|
|
| string |
| TokenIndex:: |
| toString(vector<id_type> const& v) const |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| ostringstream buf; |
| for (size_t i = 0; i < v.size(); i++) |
| buf << (i ? " " : "") << (*this)[v[i]]; |
| return buf.str(); |
| } |
|
|
| string |
| TokenIndex:: |
| toString(id_type const* start, id_type const* const stop) |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| ostringstream buf; |
| if (start < stop) |
| buf << (*this)[*start]; |
| while (++start < stop) |
| buf << " " << (*this)[*start]; |
| return buf.str(); |
| } |
|
|
| string |
| TokenIndex:: |
| toString(id_type const* start, id_type const* const stop) const |
| { |
| if (!ridx.size()) |
| { |
| boost::lock_guard<boost::mutex> lk(*this->lock); |
| if (!ridx.size()) ridx = reverseIndex(); |
| } |
| ostringstream buf; |
| if (start < stop) |
| buf << (*this)[*start]; |
| while (++start < stop) |
| buf << " " << (*this)[*start]; |
| return buf.str(); |
| } |
|
|
| vector<id_type> |
| TokenIndex:: |
| toIdSeq(string const& line) const |
| { |
| istringstream buf(line); |
| string w; |
| vector<id_type> retval; |
| while (buf>>w) |
| retval.push_back((*this)[w]); |
| return retval; |
| } |
|
|
| |
| bool |
| TokenIndex:: |
| fillIdSeq(string const& line, vector<id_type> & v) const |
| { |
| bool allgood = true; string w; |
| v.clear(); |
| for (istringstream buf(line); buf>>w;) |
| { |
| v.push_back((*this)[w]); |
| allgood = allgood && v.back() > 1; |
| } |
| return allgood; |
| } |
|
|
| id_type |
| TokenIndex:: |
| getNumTokens() const |
| { |
| return numTokens; |
| } |
|
|
| id_type |
| TokenIndex:: |
| getUnkId() const |
| { |
| return unkId; |
| } |
|
|
| char const* const |
| TokenIndex:: |
| getUnkToken() const |
| { |
| return unkLabel.c_str(); |
| |
| } |
|
|
| id_type |
| TokenIndex:: |
| knownVocabSize() const |
| { |
| return numTokens; |
| } |
|
|
| id_type |
| TokenIndex:: |
| ksize() const |
| { |
| return numTokens; |
| } |
|
|
| id_type |
| TokenIndex:: |
| totalVocabSize() const |
| { return tsize(); } |
|
|
| id_type |
| TokenIndex:: |
| tsize() const |
| { |
| return (newWords != NULL |
| ? numTokens+newWords->size() |
| : numTokens); |
| } |
|
|
| void |
| write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok, |
| string const& ofile, string const& unkToken) |
| { |
| typedef pair<uint32_t,id_type> IndexEntry; |
|
|
| |
| vector<IndexEntry> index(tok.size()); |
| ostringstream data; |
| id_type unkId = tok.size(); |
| for (size_t i = 0; i < tok.size(); i++) |
| { |
| if (tok[i].first == unkToken) |
| unkId = tok[i].second; |
| index[i].first = data.tellp(); |
| index[i].second = tok[i].second; |
| data<<tok[i].first<<char(0); |
| } |
|
|
| |
| ofstream out(ofile.c_str()); |
| uint32_t vsize = index.size(); |
| out.write(reinterpret_cast<char*>(&vsize),4); |
| out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type)); |
| for (size_t i = 0; i < index.size(); i++) |
| { |
| out.write(reinterpret_cast<char*>(&index[i].first),4); |
| out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type)); |
| } |
| out<<data.str(); |
| } |
|
|
| void |
| TokenIndex:: |
| write(string fname) |
| { |
| typedef pair<string,uint32_t> Token; |
| vector<Token> tok(totalVocabSize()); |
| for (id_type i = 0; i < tok.size(); ++i) |
| tok[i] = Token((*this)[i],i); |
| sort(tok.begin(),tok.end()); |
| write_tokenindex_to_disk(tok,fname,unkLabel); |
| } |
|
|
| bool |
| TokenIndex:: |
| isDynamic() const |
| { |
| return dynamic; |
| } |
|
|
| bool |
| TokenIndex:: |
| setDynamic(bool on) |
| { |
| bool ret = dynamic; |
| if (on && this->str2idExtra == NULL) |
| { |
| this->str2idExtra.reset(new map<string,id_type>()); |
| this->newWords.reset(new vector<string>()); |
| } |
| dynamic = on; |
| if (on) |
| { |
| (*this)["NULL"]; |
| (*this)[unkLabel]; |
| } |
| return ret; |
| } |
|
|
| void |
| TokenIndex:: |
| setUnkLabel(string unk) |
| { |
| unkId = (*this)[unk]; |
| unkLabel = unk; |
| } |
|
|
| } |
|
|