Spaces:
Build error
Build error
| from tqdm import tqdm | |
| import os | |
| from collections import defaultdict | |
def read_file(fname):
    """
    Lazily yields the lines of a UTF-8 text file.
    fname: path of the file to read.
    Each yielded line has leading and trailing whitespace (including the
    newline) removed. The file stays open until the generator is exhausted
    or closed.
    """
    with open(fname, mode="r", encoding="utf-8") as handle:
        # str.strip with no arguments mirrors line.strip() on every line.
        yield from map(str.strip, handle)
def extract_non_english_pairs(indir, outdir, LANGS, *, multiple_translations=False):
    """
    Extracts non-English pair parallel corpora by pivoting through the
    English sentences shared by two English-centric corpora.
    indir: contains English-centric data in the following form:
        - directory named en-xx for language xx
        - each directory contains a train.en and train.xx, aligned line by line
          (if the two files differ in length, the extra lines are ignored)
    outdir: output directory to store mined data for each pair.
        One directory xx-yy is created for each pair, holding train.xx and train.yy.
    LANGS: list of languages in the corpus (other than English).
        The language codes must correspond to the ones used in the
        files and directories in indir. Preferably, sort the languages
        in this list in alphabetic order. outdir will contain data for xx-yy,
        but not for yy-xx, so it will be convenient to have this list in sorted order.
    multiple_translations: keyword-only. When False (default), an English
        sentence that occurs several times in the first language's corpus keeps
        only its last translation, and every matching line of the second
        language's corpus produces exactly one output pair. When True, every
        translation found in the first language's corpus is paired with every
        matching line of the second language's corpus.
    """
    for i in tqdm(range(len(LANGS) - 1)):
        print()
        lang1 = LANGS[i]
        fname1 = "{}/en-{}/train.en".format(indir, lang1)
        il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
        # The English side of lang1 does not depend on lang2, so read it once
        # per outer iteration instead of once per pair.
        enset_l1 = set(read_file(fname1))

        for j in range(i + 1, len(LANGS)):
            lang2 = LANGS[j]
            print("{} {}".format(lang1, lang2))

            fname2 = "{}/en-{}/train.en".format(indir, lang2)
            il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)

            common_en_set = enset_l1.intersection(read_file(fname2))

            # Map each shared English sentence to a LIST of lang1 translations.
            # Always storing a list keeps the writing loop below uniform and
            # avoids iterating over the characters of a bare string.
            en_lang1_dict = {}
            for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
                if en_line not in common_en_set:
                    continue
                if multiple_translations:
                    en_lang1_dict.setdefault(en_line, []).append(il_line)
                else:
                    # Last occurrence wins, as with plain dict assignment.
                    en_lang1_dict[en_line] = [il_line]

            os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
            out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
                o=outdir, l1=lang1, l2=lang2
            )
            out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
                o=outdir, l1=lang1, l2=lang2
            )

            with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
                out_l2_fname, "w", encoding="utf-8"
            ) as out_l2_file:
                for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
                    # Sentences without a lang1 counterpart yield no pairs.
                    for il_line_lang1 in en_lang1_dict.get(en_line, ()):
                        out_l1_file.write(il_line_lang1 + "\n")
                        out_l2_file.write(il_line + "\n")
def get_extracted_stats(outdir, LANGS):
    """
    Collects line counts from the directories of extracted language pairs.
    outdir: directory holding the mined data, with one sub-directory
        xx-yy per pair, each containing a train.xx file.
    LANGS: list of languages in the corpus (other than English).
        The language codes must correspond to the ones used in the
        directory and file names under outdir. Pairs are formed in list
        order, so only the xx-yy directory (xx before yy in LANGS) is read.
    Returns a list of (lang_a, lang_b, count) tuples. Each pair contributes
    two entries, (xx, yy, count) followed by (yy, xx, count), where count is
    the number of lines in xx-yy/train.xx.
    """
    n_langs = len(LANGS)
    stats = []
    for first in tqdm(range(n_langs - 1)):
        src = LANGS[first]
        for second in range(first + 1, n_langs):
            tgt = LANGS[second]
            src_path = "{o}/{l1}-{l2}/train.{l1}".format(o=outdir, l1=src, l2=tgt)
            # Count the lines of the first language's side of the pair.
            n_lines = 0
            for _ in read_file(src_path):
                n_lines += 1
            # Record the same count under both orderings of the pair.
            stats.extend([(src, tgt, n_lines), (tgt, src, n_lines)])
    return stats