| | import argparse |
| | from collections import namedtuple |
| | import os |
| |
|
# Placeholder paths — point these at real locations before running.
DATADIR = "/path/to/train_data"  # training corpora: one "<src>_<tgt>" directory per language pair
DEDUP_FROM_DIR = "/path/to/eval/data"  # eval files whose lines must NOT remain in the training data
OUTPUT_DIR = "/path/to/output/data"  # deduplicated train.{src,tgt} files are written here
| |
|
| |
|
def main(args):
    """Deduplicate a slice of the language-pair training directories.

    Scans DATADIR for directories named "<src>_<tgt>", sorts the discovered
    pairs, and runs dedup() on the window
    [args.start_index, args.start_index + args.size) against the eval data.

    Args:
        args: argparse.Namespace with int attributes `start_index` and `size`.
    """
    languages = set()
    for language_directory in os.listdir(DATADIR):
        # BUG FIX: `src, tgt = name.split("_")` crashed with ValueError on
        # names containing more than one underscore (e.g. "en_us_fr"); only
        # accept names that split into exactly two parts.
        parts = language_directory.split("_")
        if len(parts) == 2:
            src, tgt = parts
            languages.add(LanguagePair(src=src, tgt=tgt))

    data = existing_data()
    train_languages = sorted(languages)
    # Process only this worker's slice so the job can be sharded by index.
    for language_pair in train_languages[args.start_index:args.start_index + args.size]:
        print(language_pair)
        dedup(language_pair, data)
| |
|
| |
|
| | LanguagePair = namedtuple("LanguagePair", ["src", "tgt"]) |
| |
|
| |
|
def existing_data():
    """Return the union of all lines (newline included) from every file in
    DEDUP_FROM_DIR."""
    seen = set()
    for filename in os.listdir(DEDUP_FROM_DIR):
        with open(os.path.join(DEDUP_FROM_DIR, filename)) as handle:
            seen.update(handle.readlines())
    return seen
| | |
def dedup(language_pair, data, verbose=True, output=True):
    """Filter a parallel training corpus against a set of forbidden lines.

    Reads DATADIR/<src>_<tgt>/train.{src,tgt}, drops every sentence pair in
    which either side occurs in `data`, and writes the survivors to
    OUTPUT_DIR. Does nothing if both output files already exist, or if the
    input files are missing.

    Args:
        language_pair: LanguagePair naming the source/target languages.
        data: set of newline-terminated lines to exclude (see existing_data).
        verbose: print progress messages.
        output: when False, stop after the existence checks (dry run).
    """
    pair_dir = f"{DATADIR}/{language_pair.src}_{language_pair.tgt}"
    train_filenames = LanguagePair(
        src=f"{pair_dir}/train.{language_pair.src}",
        tgt=f"{pair_dir}/train.{language_pair.tgt}",
    )

    output_filenames = LanguagePair(
        src=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.src}",
        tgt=f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}.{language_pair.tgt}",
    )

    # Idempotence: skip pairs a previous run already finished.
    if (os.path.exists(output_filenames.src) and
            os.path.exists(output_filenames.tgt)):
        if verbose:
            print(f"{language_pair.src}-{language_pair.tgt} already done.")
        return

    if verbose:
        print(f"{language_pair.src}-{language_pair.tgt} ready, will check dups.")

    if not output:
        return

    if os.path.exists(train_filenames.src) and os.path.exists(train_filenames.tgt):
        with open(train_filenames.src) as f:
            train_source = f.readlines()

        with open(train_filenames.tgt) as f:
            train_target = f.readlines()

        # BUG FIX: this alignment check previously ran AFTER the filtering
        # loop, which indexed train_target[i] — a shorter target file raised
        # IndexError before the assert could report the misalignment. Check
        # first, then iterate in lockstep with zip.
        assert len(train_source) == len(train_target)

        new_train_source = []
        new_train_target = []
        for src_line, tgt_line in zip(train_source, train_target):
            # Drop the pair if EITHER side appears in the dedup set.
            if src_line not in data and tgt_line not in data:
                new_train_source.append(src_line)
                new_train_target.append(tgt_line)

        assert len(new_train_source) == len(new_train_target)
        assert len(new_train_source) <= len(train_source)

        # Lines retain their trailing newlines from readlines(), so they can
        # be written back verbatim.
        with open(output_filenames.src, "w") as o:
            o.writelines(new_train_source)

        with open(output_filenames.tgt, "w") as o:
            o.writelines(new_train_target)
| |
|
| |
|
if __name__ == '__main__':
    # CLI: choose which window of the sorted language pairs this worker
    # handles, enabling simple sharding across parallel jobs.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-s", "--start-index", required=True, type=int)
    arg_parser.add_argument("-n", "--size", required=True, type=int)
    cli_args = arg_parser.parse_args()
    main(cli_args)
| |
|