#include <unistd.h>

#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <re2/re2.h>

#include "Parameters.h"
|
|
| #ifdef TOKENIZER_NAMESPACE |
| namespace TOKENIZER_NAMESPACE { |
| #endif |
|
|
| |
| |
| |
| |
| class Tokenizer { |
|
|
| private: |
|
|
| typedef enum { |
| empty = 0, |
| blank, |
| upper, |
| letta, |
| numba, |
| hyphn, |
| stops, |
| quote, |
| pinit, |
| pfini, |
| pfpct, |
| marks, |
| limit |
| } charclass_t; |
|
|
| std::size_t nthreads; |
| std::size_t chunksize; |
| std::string cfg_dir; |
|
|
| |
| std::set<std::string> nbpre_num_set; |
| |
| std::set<std::string> nbpre_gen_set; |
|
|
| |
| std::set<std::wstring> nbpre_num_ucs4; |
| |
| std::set<std::wstring> nbpre_gen_ucs4; |
|
|
| |
| std::vector<re2::RE2 *> prot_pat_vec; |
|
|
| protected: |
|
|
| |
| std::string lang_iso; |
| bool english_p; |
| bool latin_p; |
| bool skip_xml_p; |
| bool skip_alltags_p; |
| bool entities_p; |
| bool escape_p; |
| bool unescape_p; |
| bool aggressive_hyphen_p; |
| bool supersub_p; |
| bool url_p; |
| bool downcase_p; |
| bool normalize_p; |
| bool penn_p; |
| bool narrow_latin_p; |
| bool narrow_kana_p; |
| bool refined_p; |
| bool drop_bad_p; |
| bool splits_p; |
| bool verbose_p; |
| bool para_marks_p; |
| bool split_breaks_p; |
|
|
| |
| std::pair<int,int> load_prefixes(std::ifstream& ifs); |
|
|
| |
| void protected_tokenize(std::string& inplace); |
|
|
| |
| struct VectorTokenizerCallable { |
| Tokenizer *tokenizer; |
| std::vector<std::string>& in; |
| std::vector<std::string>& out; |
|
|
| VectorTokenizerCallable(Tokenizer *_tokenizer, |
| std::vector<std::string>& _in, |
| std::vector<std::string>& _out) |
| : tokenizer(_tokenizer) |
| , in(_in) |
| , out(_out) { |
| }; |
|
|
| void operator()() { |
| out.resize(in.size()); |
| for (std::size_t ii = 0; ii < in.size(); ++ii) |
| if (in[ii].empty()) |
| out[ii] = in[ii]; |
| else if (tokenizer->penn_p) |
| out[ii] = tokenizer->penn_tokenize(in[ii]); |
| else |
| out[ii] = tokenizer->quik_tokenize(in[ii]); |
| }; |
| }; |
|
|
| public: |
|
|
| Tokenizer(); |
|
|
| |
| Tokenizer(const Parameters& _params); |
|
|
| |
| ~Tokenizer(); |
|
|
| |
| void init(const char *cfg_dir_path = 0); |
|
|
| void set_config_dir(const std::string& _cfg_dir); |
|
|
| |
| void reset(); |
|
|
| |
| bool splitting() const { return splits_p; } |
|
|
| |
| bool escape(std::string& inplace); |
|
|
| |
| |
| |
| bool unescape(std::string& inplace); |
|
|
| |
| std::size_t tokenize(std::istream& is, std::ostream& os); |
|
|
| |
| std::string quik_tokenize(const std::string& buf); |
|
|
| |
| std::string penn_tokenize(const std::string& buf); |
|
|
| |
| std::string tokenize(const std::string& buf) { |
| return penn_p ? penn_tokenize(buf) : quik_tokenize(buf); |
| } |
|
|
| |
| void tokenize(const std::string& buf, std::string& outs) { |
| outs = tokenize(buf); |
| } |
|
|
| |
| std::vector<std::string> tokens(const std::string& in) { |
| std::istringstream tokss(penn_p ? penn_tokenize(in) : tokenize(in)); |
| std::vector<std::string> outv; |
| std::copy(std::istream_iterator<std::string>(tokss), |
| std::istream_iterator<std::string>(), |
| std::back_inserter(outv)); |
| return outv; |
| } |
|
|
| |
| std::size_t detokenize(std::istream& is, std::ostream &os); |
|
|
| |
| std::string detokenize(const std::string& buf); |
|
|
| void detokenize(const std::string& buf, std::string& outs) { |
| outs = detokenize(buf); |
| } |
|
|
| |
| std::string detokenize(const std::vector<std::string>& inv) { |
| std::ostringstream oss; |
| std::copy(inv.begin(), inv.end(), std::ostream_iterator<std::string>(oss," ")); |
| return detokenize(oss.str()); |
| } |
|
|
| |
| std::vector<std::string> splitter(const std::string &istr,bool *continuation_p = 0); |
|
|
| |
| std::pair<std::size_t,std::size_t> splitter(std::istream& is, std::ostream& os); |
|
|
| }; |
|
|
| #ifdef TOKENIZER_NAMESPACE |
| }; |
| #endif |
|
|