Spaces:
Sleeping
Sleeping
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # | |
| # This source code is licensed under the MIT license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| from dataclasses import dataclass, field | |
| from fairseq.data.encoders import register_tokenizer | |
| from fairseq.dataclass import FairseqDataclass | |
| class MosesTokenizerConfig(FairseqDataclass): | |
| source_lang: str = field(default="en", metadata={"help": "source language"}) | |
| target_lang: str = field(default="en", metadata={"help": "target language"}) | |
| moses_no_dash_splits: bool = field( | |
| default=False, metadata={"help": "don't apply dash split rules"} | |
| ) | |
| moses_no_escape: bool = field( | |
| default=False, | |
| metadata={"help": "don't perform HTML escaping on apostrophe, quotes, etc."}, | |
| ) | |
| class MosesTokenizer(object): | |
| def __init__(self, cfg: MosesTokenizerConfig): | |
| self.cfg = cfg | |
| try: | |
| from sacremoses import MosesTokenizer, MosesDetokenizer | |
| self.tok = MosesTokenizer(cfg.source_lang) | |
| self.detok = MosesDetokenizer(cfg.target_lang) | |
| except ImportError: | |
| raise ImportError( | |
| "Please install Moses tokenizer with: pip install sacremoses" | |
| ) | |
| def encode(self, x: str) -> str: | |
| return self.tok.tokenize( | |
| x, | |
| aggressive_dash_splits=(not self.cfg.moses_no_dash_splits), | |
| return_str=True, | |
| escape=(not self.cfg.moses_no_escape), | |
| ) | |
| def decode(self, x: str) -> str: | |
| return self.detok.detokenize(x.split()) | |