| import sys | |
| import argparse | |
| from tqdm import tqdm | |
| from build_emov_translation_manifests import dedup, remove_under_k | |
def build_output_path(path, k, dedup_applied):
    """Return ``path`` with processing tags inserted before its extension.

    Args:
        path: Base path (the input km file, or the user-supplied output).
        k: Value of ``--remove-under-k``; a ``-k{k}`` tag is added when > 0.
        dedup_applied: Whether ``--dedup`` was requested; adds ``-deduped``.

    Returns:
        The tagged path, e.g. ``a.km`` -> ``a-k3-deduped.km``.
    """
    import os.path

    # Split off only the final extension. A plain str.replace(".km", ...)
    # would also rewrite ".km" inside directory names, and would leave the
    # path untouched (overwriting the input) when it has no ".km" at all.
    stem, ext = os.path.splitext(path)
    if k > 0:
        stem += f"-k{k}"
    if dedup_applied:
        stem += "-deduped"
    return stem + ext


def main(argv=None):
    """Process a km file line by line.

    Depending on the flags, each line is passed through ``remove_under_k``
    (when ``--remove-under-k`` is positive) and then through ``dedup``
    (when ``--dedup`` is set). The result is written to a path derived from
    ``--output`` (or the input path) with ``-k{k}`` / ``-deduped`` tags.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("km", type=str, help="path to km file")
    parser.add_argument("--dedup", action='store_true')
    parser.add_argument("--remove-under-k", type=int, default=0)
    parser.add_argument("--output", default=None)
    args = parser.parse_args(argv)

    # A non-positive k disables the filter, so treat it like the default 0.
    if not args.dedup and args.remove_under_k <= 0:
        print("nothing to do! quitting...")
        sys.exit(0)

    # Read everything up front so tqdm can report a total.
    with open(args.km, "r") as km_file:
        km = km_file.readlines()

    out = []
    for line in tqdm(km):
        if args.remove_under_k > 0:
            line = remove_under_k(line, args.remove_under_k)
        if args.dedup:
            line = dedup(line)
        out.append(line)

    base = args.km if args.output is None else args.output
    path = build_output_path(base, args.remove_under_k, args.dedup)
    with open(path, "w") as out_file:
        out_file.writelines(out)
    print(f"written to {path}")


if __name__ == "__main__":
    main()