# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
| import os | |
| from collections import Counter | |
| import torch | |
| from fairseq.file_io import PathManager | |
| from fairseq.tokenizer import tokenize_line | |
| from typing import List, Dict | |
def safe_readline(f):
    """Read one line from ``f``, tolerating a seek into the middle of a
    multi-byte character.

    If the current position splits a UTF-8 sequence, ``readline`` raises
    ``UnicodeDecodeError``; back up one byte at a time until the read
    starts on a character boundary, then return that line.
    """
    start = f.tell()
    while True:
        try:
            return f.readline()
        except UnicodeDecodeError:
            # Landed inside a multi-byte character: step back and retry.
            start -= 1
            f.seek(start)
class Binarizer:
    """Namespace of static helpers that convert a text file into a stream of
    token-id tensors, feeding each tensor to a consumer callback so that
    several worker processes can binarize disjoint byte ranges of one file.
    """

    @staticmethod
    def binarize(
        filename,
        dict,  # NOTE: shadows the builtin name; kept for caller compatibility
        consumer,
        tokenize=tokenize_line,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
    ) -> Dict[str, int]:
        """Binarize lines of ``filename`` between byte ``offset`` and ``end``.

        Each line is encoded into a ``torch.IntTensor`` of token ids and
        passed to ``consumer``.

        Args:
            filename: path to the input text file (resolved via PathManager).
            dict: dictionary object exposing ``encode_line``, ``eos()``,
                ``unk_index`` and ``unk_word`` — presumably a fairseq
                Dictionary.
            consumer: callable invoked with each ids tensor.
            tokenize: line tokenizer used when lines are raw text.
            append_eos: append ``dict.eos()`` to every line.
            reverse_order: reverse the token order within each line.
            offset: byte offset at which to start reading.
            end: byte offset past which to stop (-1 means read to EOF).
            already_numberized: lines already contain whitespace-separated
                integer ids; skip dictionary encoding.

        Returns:
            Stats dict with ``nseq`` (lines processed), ``ntok`` (tokens),
            ``nunk`` (tokens replaced by unk) and the ``replaced`` Counter.
        """
        nseq, ntok = 0, 0
        replaced = Counter()

        def replaced_consumer(word, idx):
            # Count words that mapped to <unk>, excluding the literal unk word.
            if idx == dict.unk_index and word != dict.unk_word:
                replaced.update([word])

        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
            f.seek(offset)
            # next(f) breaks f.tell(), hence readline() must be used
            line = safe_readline(f)
            while line:
                # f.tell() does not always give the byte position in the file
                # sometimes it skips to a very large number
                # it is unlikely that through a normal read we go from
                # end bytes to end + 2**32 bytes (4 GB) and this makes it unlikely
                # that the procedure breaks by the undeterministic behavior of
                # f.tell()
                if end > 0 and f.tell() > end and f.tell() < end + 2 ** 32:
                    break
                if already_numberized:
                    id_strings = line.strip().split()
                    id_list = [int(id_string) for id_string in id_strings]
                    if reverse_order:
                        id_list.reverse()
                    if append_eos:
                        id_list.append(dict.eos())
                    ids = torch.IntTensor(id_list)
                else:
                    ids = dict.encode_line(
                        line=line,
                        line_tokenizer=tokenize,
                        add_if_not_exist=False,
                        consumer=replaced_consumer,
                        append_eos=append_eos,
                        reverse_order=reverse_order,
                    )
                nseq += 1
                ntok += len(ids)
                consumer(ids)
                line = f.readline()
        return {
            "nseq": nseq,
            "nunk": sum(replaced.values()),
            "ntok": ntok,
            "replaced": replaced,
        }

    @staticmethod
    def binarize_alignments(
        filename, alignment_parser, consumer, offset=0, end=-1
    ) -> Dict[str, int]:
        """Parse alignment lines of ``filename`` between ``offset`` and ``end``.

        Each line is converted by ``alignment_parser`` and the result passed
        to ``consumer``.

        Returns:
            Stats dict with ``nseq``, the number of lines processed.
        """
        nseq = 0
        with open(PathManager.get_local_path(filename), "r") as f:
            f.seek(offset)
            line = safe_readline(f)
            while line:
                if end > 0 and f.tell() > end:
                    break
                ids = alignment_parser(line)
                nseq += 1
                consumer(ids)
                line = f.readline()
        return {"nseq": nseq}

    @staticmethod
    def find_offsets(filename, num_chunks) -> List[int]:
        """Split ``filename`` into ``num_chunks`` roughly equal byte ranges
        aligned on line boundaries.

        Returns:
            ``num_chunks + 1`` byte offsets; chunk ``i`` spans
            ``offsets[i]`` to ``offsets[i + 1]`` (last entry stays 0 —
            consumers treat it as "read to EOF" via ``end=-1`` semantics
            upstream; TODO confirm against callers).
        """
        with open(PathManager.get_local_path(filename), "r", encoding="utf-8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = size // num_chunks
            offsets = [0 for _ in range(num_chunks + 1)]
            for i in range(1, num_chunks):
                f.seek(chunk_size * i)
                # Advance past the (possibly partial) current line so every
                # chunk starts at a line boundary.
                safe_readline(f)
                offsets[i] = f.tell()
            return offsets