"""Tokenize a text file line-by-line with an existing (pre-trained) tokenizer.

Reads one assembly string per line from --input, encodes it with the
tokenizer loaded from --tokenizer, and writes the space-joined token
sequence to --output (one line per input line). Prints the maximum
token count observed, which is useful for choosing a model max length.

NOTE(review): this module uses a relative import (``from .bpe import ...``),
so it must be run as a package module (``python -m pkg.this_module``),
not as a bare script path — confirm against the project's README.
"""
from tqdm import tqdm

from .bpe import load_asm_tok

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser("Tokenize using existing tokenizer")
    parser.add_argument("-t", "--tokenizer", required=True, help="existing tokenizer")
    parser.add_argument("-i", "--input", required=True, help="input file")
    parser.add_argument("-o", "--output", required=True, help="output file")
    args = parser.parse_args()

    # Presumably a HuggingFace `tokenizers`-style object whose .encode()
    # returns an encoding with a .tokens list — verify against .bpe.
    asm_tok = load_asm_tok(args.tokenizer)

    max_asm_toks = 0
    # Stream the input line-by-line so arbitrarily large files fit in memory.
    # Explicit UTF-8 avoids platform-default-encoding surprises.
    with open(args.input, "r", encoding="utf-8") as asmf, \
         open(args.output, "w", encoding="utf-8") as asmtokf:
        # Plain string desc: the original used an f-string with no
        # placeholders (needless f-prefix); the visible text is unchanged.
        for asm in tqdm(asmf, desc="Tokenizing"):
            asm_enc = asm_tok.encode(asm.strip())
            # Track the longest token sequence seen across the whole file.
            max_asm_toks = max(max_asm_toks, len(asm_enc.tokens))
            asmtokf.write(" ".join(asm_enc.tokens) + "\n")
    print("Maximum tokens:", max_asm_toks)