| from typing import NamedTuple, List |
| from urllib.parse import urlparse |
| import os, sys |
| import subprocess |
| from subprocess import check_call, check_output |
| import glob |
| import wget |
| import re |
| import multiprocessing as mp |
| from functools import partial |
| import pathlib |
| from collections import OrderedDict |
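# Download and preprocess WMT parallel data (WMT13-WMT19) into the ML50 raw
# layout: fetch corpora, extract archives, convert SGM/TMX/TSV sources to plain
# text, concatenate the files of each split, and LID-filter the training data.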
|
|
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
    sys.exit(-1)
|
|
| |
# External tools are expected to be checked out under ./utils next to this
# script: mosesdecoder (SGM conversion) and tmx2corpus (TMX conversion).
# UTILS is resolved relative to the script so it also works when run elsewhere.
UTILS = os.path.join(pathlib.Path(__file__).parent, 'utils')

MOSES = f"{UTILS}/mosesdecoder"
SGM_TOOL = f'{MOSES}/scripts/ems/support/input-from-sgm.perl'

TMX2CORPUS = f"{UTILS}/tmx2corpus"
TMX_TOOL = f'python {TMX2CORPUS}/tmx2corpus.py'
|
|
to_data_path = f'{WORKDIR_ROOT}/wmt'
download_to = f'{to_data_path}/downloads'
extract_to = f'{to_data_path}/extracted'

raw_data = f'{WORKDIR_ROOT}/ML50/raw'
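# A DLDataset describes one WMT release: download URLs per split plus glob
# patterns selecting the extracted files of each split. URL entries are either
# a plain URL string or a (url, local_filename) tuple; pattern entries may be
# paired with the list of language directions ('src-tgt') they apply to.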
| |
|
|
| class DLDataset(NamedTuple): |
| name: str |
| train_urls: List[str] |
| valid_urls: List[str] |
| test_urls: List[str] |
| train_files_patterns: List[str] = [] |
| valid_files_patterns: List[str] = [] |
| test_files_patterns: List[str] = [] |
|
|
|
|
|
|
def bar_custom(current, total, width=80):
    # Single-line progress display for wget.download; guard against unknown sizes.
    if total > 0:
        print("Downloading: %d%% [%d / %d] KB" % (current / total * 100, current / 1000, total / 1000), end='\r')
|
|
def get_downloaded_file(dl_folder, url):
    # A url record is either a plain URL string, or a (url, filename) tuple
    # (where url may itself be a tuple of part URLs for multi-part downloads).
    if isinstance(url, tuple):
        url, f = url
    else:
        url_f = urlparse(url)
        f = '_'.join(url_f.path.split('/')[1:])
    return url, f"{dl_folder}/{f}"
|
|
def download_parts_and_combine(dl_folder, urls, filename):
    parts = []
    for url_record in urls:
        url, part_file = get_downloaded_file(dl_folder, url_record)
        if os.path.exists(part_file):
            print(f'{part_file} has already been downloaded so skip')
        else:
            part_file = wget.download(url, part_file, bar=bar_custom)
        parts.append(part_file)

    def get_combine_cmd(parts):
        # Multi-part archives (e.g. UNv1.0.*.tar.gz.00/.01) are concatenated
        # back into a single file before extraction.
        return f'cat {" ".join(parts)} > {filename}'

    combine_cmd = get_combine_cmd(parts)
    call(combine_cmd, debug=True)
    return filename
|
|
def download_a_url(dl_folder, url):
    url, filename = get_downloaded_file(dl_folder, url)
    if os.path.exists(filename):
        print(f'{filename} has already been downloaded so skip')
        return filename

    print(f'downloading {url} to {filename}')
    if isinstance(url, (list, tuple)):
        # A tuple of URLs means a multi-part download.
        download_parts_and_combine(dl_folder, url, filename)
    else:
        wget.download(url, filename, bar=bar_custom)
    print(f'downloaded: {filename}')
    return filename
|
|
def download_files(dl_folder, urls, completed_urls=None):
    # Avoid a shared mutable default; callers may pass a dict to accumulate into.
    if completed_urls is None:
        completed_urls = {}
    for url_record in urls:
        url, _ = get_downloaded_file(dl_folder, url_record)
        filename = download_a_url(dl_folder, url_record)
        completed_urls[str(url)] = filename
    return completed_urls
|
|
def check_need_manual_download(dl_folder, to_manually_download_urls):
    to_be_manually_downloaded = []
    manually_completed_urls = {}
    for url_record, instruction in to_manually_download_urls:
        url, filename = get_downloaded_file(dl_folder, url_record)
        if not os.path.exists(filename):
            print(f'{url} needs to be downloaded manually; please download it following: {instruction}, then copy it to {filename}')
            to_be_manually_downloaded.append((url, filename))
        else:
            manually_completed_urls[url] = filename
    return to_be_manually_downloaded
| |
def download_dataset(to_folder, dl_dataset, completed_urls=None):
    if completed_urls is None:
        completed_urls = {}
    download_files(to_folder, dl_dataset.train_urls, completed_urls)
    download_files(to_folder, dl_dataset.valid_urls, completed_urls)
    download_files(to_folder, dl_dataset.test_urls, completed_urls)
    print('completed downloading')
    return completed_urls
|
|
def call(cmd, debug=False):
    # Run a shell command; echo it first when debug is set, raise on failure.
    if debug:
        print(cmd)
    check_call(cmd, shell=True)
|
|
| |
| def get_extract_name(file_path): |
| path = os.path.split(file_path) |
| return path[-1] + '_extract' |
|
|
def extract_file(downloaded_file, extract_folder, get_extract_name=get_extract_name, debug=False):
    extract_name = get_extract_name(downloaded_file)
    extract_to = f'{extract_folder}/{extract_name}'
    os.makedirs(extract_to, exist_ok=True)
    if os.path.exists(f'{extract_to}/DONE'):
        print(f'{downloaded_file} has already been extracted to {extract_to} so skip')
        return extract_to

    def get_extract_cmd(filename):
        if filename.endswith('.tgz') or filename.endswith('tar.gz'):
            return f'tar xzfv {filename} -C {extract_to}'
        elif filename.endswith('.gz.tar'):
            return f'tar xfv {filename} -C {extract_to}; (cd {extract_to}; gzip -d *.gz; [ $? -eq 0 ] || gzip -d */*.gz)'
        elif filename.endswith('.tar'):
            return f'tar xfv {filename} -C {extract_to}'
        elif filename.endswith('.gz'):
            return f'cp {filename} {extract_to}; (cd {extract_to}; gzip -d *.gz)'
        elif filename.endswith('.zip'):
            return f'unzip {filename} -d {extract_to}'
        else:
            raise ValueError(f'{filename} has an unsupported archive format')

    extract_cmd = get_extract_cmd(downloaded_file)
    print(f'extracting {downloaded_file}')
    if isinstance(extract_cmd, list):
        for c in extract_cmd:
            call(c, debug=debug)
    else:
        call(extract_cmd, debug=debug)
    # A DONE marker makes extraction idempotent across reruns.
    call(f'echo DONE > {extract_to}/DONE')
    return extract_to
|
|
|
|
def extract_all_files(
        completed_urls, extract_folder,
        get_extract_name=get_extract_name,
        completed_extraction=None,
        debug=False):
    if completed_extraction is None:
        completed_extraction = {}
    extracted_folders = OrderedDict()
    # Iterate items directly; wrapping them in a set would lose deterministic order.
    for url, downloaded_file in completed_urls.items():
        if downloaded_file in completed_extraction:
            print(f'{downloaded_file} is already extracted; so skip')
            continue
        folder = extract_file(downloaded_file, extract_folder, get_extract_name, debug)
        extracted_folders[url] = folder
    return extracted_folders
|
|
|
|
def my_glob(folder):
    # Glob files up to three directory levels below folder.
    for p in [f'{folder}/*', f'{folder}/*/*', f'{folder}/*/*/*']:
        for f in glob.glob(p):
            yield f
|
|
|
|
def sgm2raw(sgm, debug):
    # Strip SGML markup from WMT test sets using the Moses helper script.
    to_file = sgm[0:len(sgm) - len('.sgm')]
    if os.path.exists(to_file):
        debug and print(f'{sgm} already converted to {to_file}; so skip')
        return to_file
    cmd = f'{SGM_TOOL} < {sgm} > {to_file}'
    call(cmd, debug)
    return to_file
|
|
def tmx2raw(tmx, debug):
    # Convert a TMX translation memory into bitext.* files via tmx2corpus.
    to_file = tmx[0:len(tmx) - len('.tmx')]
    to_folder = os.path.join(*os.path.split(tmx)[:-1])
    if os.path.exists(f'{to_folder}/bitext.en'):
        debug and print(f'{tmx} already extracted to {to_file}; so skip')
        return to_file
    cmd = f'(cd {to_folder}; {TMX_TOOL} {tmx})'
    call(cmd, debug)
    return to_file
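# Patterns identifying extracted files that need custom post-processing in
# convert_file_if_needed (WMT19_WIKITITLES_REGEX is currently unused).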
|
|
| CZENG16_REGEX = re.compile(r'.*?data.plaintext-format/0[0-9]train$') |
| WMT19_WIKITITLES_REGEX = re.compile(r'.*?wikititles-v1.(\w\w)-en.tsv.gz') |
| TSV_REGEX = re.compile(r'.*?(\w\w)-(\w\w).tsv$') |
|
|
|
|
|
|
def cut_wikititles(wiki_file, debug):
    # The WMT15 Finnish wiki titles come as a single "fi ||| en" file; split it
    # into per-language files (the awk idiom normalizes whitespace).
    if wiki_file.endswith('wiki/fi-en/titles.fi-en'):
        to_file1 = f'{wiki_file}.fi'
        to_file2 = f'{wiki_file}.en'
        BACKSLASH = '\\'
        cmd1 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
        cmd2 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
    else:
        return None
    if os.path.exists(to_file1) and os.path.exists(to_file2):
        debug and print(f'{wiki_file} already processed to {to_file1} and {to_file2}; so skip')
        return wiki_file

    call(cmd1, debug=debug)
    call(cmd2, debug=debug)
    return wiki_file
|
|
def cut_tsv(file, debug):
    # Split a two-column "src<TAB>tgt" TSV into per-language files.
    m = TSV_REGEX.match(file)
    if m is None:
        raise ValueError(f'{file} does not match the TSV naming pattern')
    src = m.groups()[0]
    tgt = m.groups()[1]

    to_file1 = f'{file}.{src}'
    to_file2 = f'{file}.{tgt}'
    cmd1 = f"cat {file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
    cmd2 = f"cat {file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
    if os.path.exists(to_file1) and os.path.exists(to_file2):
        debug and print(f'{file} already processed to {to_file1} and {to_file2}; so skip')
        return file

    call(cmd1, debug=debug)
    call(cmd2, debug=debug)
    return file
|
|
| |
def convert_file_if_needed(file, debug):
    if file.endswith('.sgm'):
        return sgm2raw(file, debug)
    elif file.endswith('.tmx'):
        return tmx2raw(file, debug)
    elif file.endswith('wiki/fi-en/titles.fi-en'):
        return cut_wikititles(file, debug)
    elif file.endswith('.tsv'):
        return cut_tsv(file, debug)
    elif CZENG16_REGEX.match(file):
        return convert2czeng17(file, debug)
    else:
        return file
|
|
|
|
def convert_files_if_needed(extracted_folders, my_glob=my_glob, debug=False):
    return {
        url: list(sorted(set(convert_file_if_needed(f, debug) for f in sorted(set(my_glob(folder))))))
        for url, folder in extracted_folders.items()
    }
| |
def match_patt(file_path, file_pattern, src, tgt, lang):
    return file_pattern.format(src=src, tgt=tgt, lang=lang) in file_path


def match_patts(file_path, file_patterns, src, tgt, lang):
    for file_pattern in file_patterns:
        # (pattern, directions) records must be unpacked before formatting,
        # and skipped when the current direction is not listed.
        if isinstance(file_pattern, tuple):
            file_pattern, directions = file_pattern
            if f'{src}-{tgt}' not in directions:
                continue
        params = {k: v for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] if '{' + k + '}' in file_pattern}
        matching = file_pattern.format(**params)
        if matching in file_path:
            return True
    return False
|
|
def extracted_glob(extracted_folder, file_patterns, src, tgt, lang):
    # Expand a file pattern into a glob under the extracted folder. Besides the
    # {src}/{tgt}/{lang} placeholders, patterns support a conditional syntax:
    # '{src:TEXT}' expands to TEXT only when lang is the source side, and
    # '{tgt:TEXT}' only when lang is the target side, e.g.
    # 'newstest*-{src}{tgt}-{src:src}{tgt:ref}.{lang}'.
    def get_matching_pattern(file_pattern):
        params = {
            k: v
            for k, v in [('src', src), ('tgt', tgt), ('lang', lang)]
            if '{' + k + '}' in file_pattern
        }
        file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern)
        file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern)
        file_pattern = file_pattern.format(**params)
        return file_pattern
| for file_pattern in file_patterns: |
| if isinstance(file_pattern, tuple): |
| file_pattern, lang_pairs = file_pattern |
| if f'{src}-{tgt}' not in lang_pairs: |
| continue |
| |
| matching_pattern = get_matching_pattern(file_pattern) |
| if matching_pattern is None: |
| continue |
| glob_patterns = f'{extracted_folder}/{matching_pattern}' |
| |
| for f in glob.glob(glob_patterns): |
| yield f |
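# Enumerate every extracted file behind the given split's URLs.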
|
|
| |
| def all_extracted_files(split, src, tgt, extracted_folders, split_urls): |
| def get_url(url): |
| if isinstance(url, tuple): |
| url, downloaded_file = url |
| return url |
| return [ |
| f |
| for url in split_urls |
| for f in my_glob(extracted_folders[str(get_url(url))]) |
| ] |
|
|
def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False):
    for lang in [src, tgt]:
        to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}'
        # Patterns are matched on the short language codes (e.g. 'en' from 'en_XX').
        s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0]
        files = []
        for url in split_urls:
            if isinstance(url, tuple):
                url, downloaded_file = url
            if str(url) not in extracted_folders:
                print(f'warning: {url} not in extracted files')
                continue
            for extracted_file in set(
                    extracted_glob(
                        extracted_folders[str(url)], path_patterns,
                        s_src, s_tgt, s_lang)):
                files.append(extracted_file)
        if len(files) == 0:
            print('warning: ', f'No files found for split {to_file}')
            continue
        files = sorted(set(files))
        print(f'concatenating {len(files)} files into {to_file}')
        cmd = ['cat'] + [f'"{f}"' for f in files] + [f'>{to_file}']
        cmd = " ".join(cmd)
        call(cmd, debug=debug)
|
|
# UTILS (defined above relative to this script) also provides the fastText LID filter.
LID_MODEL = f'{download_to}/lid.176.bin'
LID_MULTI = f'{UTILS}/fasttext_multi_filter.py'
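# Filter a training split with fastText language identification, keeping only
# sentence pairs whose detected languages match the expected source and target.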
|
|
| def lid_filter(split, src, tgt, from_folder, to_folder, debug=False): |
| if not os.path.exists(LID_MODEL): |
| call(f'wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O {LID_MODEL}') |
| from_prefix = f'{from_folder}/{split}.{src}-{tgt}' |
| to_prefix = f'{to_folder}/{split}.{src}-{tgt}' |
| if os.path.exists(f'{from_prefix}.{src}') and os.path.exists(f'{from_prefix}.{tgt}'): |
| s_src, s_tgt = src.split('_')[0], tgt.split('_')[0] |
| cmd = ( |
| f'python {LID_MULTI} --model {LID_MODEL} --inputs {from_prefix}.{src} {from_prefix}.{tgt} ' |
| f'--langs {s_src} {s_tgt} --outputs {to_prefix}.{src} {to_prefix}.{tgt}' |
| ) |
| print(f'filtering {from_prefix}') |
| call(cmd, debug=debug) |
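# Build train/valid/test files for one language pair; only the train split is
# LID-filtered, valid and test are concatenated directly into to_folder.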
|
|
| def concat_into_splits(dl_dataset, src, tgt, extracted_folders, to_folder, debug): |
| to_folder_tmp = f"{to_folder}_tmp" |
| os.makedirs(to_folder_tmp, exist_ok=True) |
| concat_files('train', src, tgt, |
| extracted_folders, |
| split_urls=dl_dataset.train_urls, |
| path_patterns=dl_dataset.train_files_patterns, |
| to_folder=to_folder_tmp, debug=debug) |
| lid_filter('train', src, tgt, to_folder_tmp, to_folder, debug) |
|
|
| concat_files('valid', src, tgt, |
| extracted_folders, |
| split_urls=dl_dataset.valid_urls, |
| path_patterns=dl_dataset.valid_files_patterns, |
| to_folder=to_folder, debug=debug) |
| concat_files('test', src, tgt, |
| extracted_folders, |
| split_urls=dl_dataset.test_urls, |
| path_patterns=dl_dataset.test_files_patterns, |
| to_folder=to_folder, debug=debug) |
| |
|
|
def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False):
    # extract_folder is currently unused; downloads happen in a worker pool.
    pool = mp.Pool(processes=num_processes)
    download_f = partial(download_a_url, dl_folder)
    # map (unlike a lazy imap) blocks until every download finishes and
    # re-raises any worker exception.
    downloaded_files = pool.map(download_f, urls)
    pool.close()
    pool.join()
    return downloaded_files
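# Parse the BLEU score out of sacrebleu's output; -1.0 signals that no score was found.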
|
|
| BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ") |
| def run_eval_bleu(cmd): |
| output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip() |
| print(output) |
| bleu = -1.0 |
| for line in output.strip().split('\n'): |
| m = BLEU_REGEX.search(line) |
| if m is not None: |
| bleu = m.groups()[0] |
| bleu = float(bleu) |
| break |
| return bleu |
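# Sanity check: each concatenated test split should be identical to the official
# reference, i.e. score BLEU = 100 against the corresponding sacrebleu test set.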
|
|
def check_wmt_test_bleu(raw_folder, wmt_lang_pairs):
    not_matchings = []
    for wmt, src_tgts in wmt_lang_pairs:
        for src_tgt in src_tgts:
            print(f'checking test BLEU for: {src_tgt} at {wmt}')
            src, tgt = src_tgt.split('-')
            ssrc, stgt = src[:2], tgt[:2]
            if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
                # The pair may have been built in the reverse direction.
                test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
            else:
                test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
            cmd1 = f'cat {test_src} | sacrebleu -t "{wmt}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
            test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
            cmd2 = f'cat {test_tgt} | sacrebleu -t "{wmt}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
            bleu1 = run_eval_bleu(cmd1)
            if bleu1 != 100.0:
                not_matchings.append(f'{wmt}:{src_tgt} source side not matching: {test_src}')
            bleu2 = run_eval_bleu(cmd2)
            if bleu2 != 100.0:
                not_matchings.append(f'{wmt}:{src_tgt} target side not matching: {test_tgt}')
    return not_matchings
| |
def download_and_extract(
        to_folder, lang_pairs, dl_dataset,
        to_manually_download_urls,
        completed_urls=None, completed_extraction=None,
        debug=False):
    if completed_urls is None:
        completed_urls = {}
    if completed_extraction is None:
        completed_extraction = {}

    dl_folder = f'{to_folder}/downloads'
    extract_folder = f'{to_folder}/extracted'
    raw_folder = f'{to_folder}/raw'
    lid_filtered = f'{to_folder}/lid_filtered'

    os.makedirs(extract_folder, exist_ok=True)
    os.makedirs(raw_folder, exist_ok=True)
    os.makedirs(lid_filtered, exist_ok=True)

    # Warn about any missing manual downloads (the caller decides whether to abort).
    check_need_manual_download(dl_folder, to_manually_download_urls)

    completed_urls = download_dataset(
        dl_folder, dl_dataset, completed_urls)
    if debug:
        print('completed urls: ', completed_urls)

    extracted_folders = extract_all_files(
        completed_urls,
        extract_folder=extract_folder,
        completed_extraction=completed_extraction,
        debug=debug)
    if debug:
        print('downloaded files have been extracted to folders: ', extracted_folders)

    convert_files_if_needed(extracted_folders, debug=debug)
    for src_tgt in lang_pairs:
        print(f'working on {dl_dataset.name}: {src_tgt}')
        src, tgt = src_tgt.split('-')
        concat_into_splits(dl_dataset,
                           src=src, tgt=tgt,
                           extracted_folders=extracted_folders,
                           to_folder=raw_folder, debug=debug)
    print('completed data into: ', raw_folder)
|
|
def download_czeng16(download_to, username=None):
    wgets = [
        f'wget --user={username} --password=czeng -P {download_to} http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar'
        for i in range(10)]
    cmds = []
    for i, cmd in enumerate(wgets):
        filename = f'{download_to}/data-plaintext-format.{i}.tar'
        if os.path.exists(filename):
            print(f'{filename} has already been downloaded; so skip')
            continue
        cmds.append(cmd)
    if cmds and username is None:
        raise ValueError('No czeng username is given; please register at http://ufal.mff.cuni.cz/czeng/czeng16 to obtain a username to download')
    for cmd in cmds:
        call(cmd)
    print('done with downloading czeng 1.6')
|
|
| def download_czeng17_script(download_to, extract_folder, debug=False): |
| url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip' |
| filename = f'{download_to}/convert_czeng16_to_17.pl.zip' |
| extract_to = f'{extract_folder}/{get_extract_name(filename)}' |
| script_path = f'{extract_to}/convert_czeng16_to_17.pl' |
| |
| if not os.path.exists(script_path): |
| wget.download(url, filename, bar=bar_custom) |
| extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug) |
| return script_path |
|
|
czeng17_script_path = ""
def convert2czeng17(file, debug):
    # Requires czeng17_script_path to be set (see download_czeng17_script); the
    # Perl script maps CzEng 1.6 plaintext shards to the 1.7 release, from which
    # columns 3 and 4 hold the Czech and English sides.
    en_file = f'{file}.en'
    cs_file = f'{file}.cs'

    if not os.path.exists(en_file) or not os.path.exists(cs_file):
        cs_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f3 > {cs_file}'
        en_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f4 > {en_file}'
        call(cs_cmd, debug)
        call(en_cmd, debug)
    else:
        print(f'already extracted: {en_file} and {cs_file}')
    return file
|
|
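# Dataset definitions: download URLs per split for each WMT release, and the
# file patterns that select the extracted files for each language direction.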
|
|
| |
| |
| |
| |
| |
| wmt13_es_en = DLDataset( |
| name='wmt13_es-en', |
| train_urls=[ |
| 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-un.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz', |
| ], |
| valid_urls=[ |
| ('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz') |
| ], |
| test_urls=[ |
| ('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz') |
| ], |
| train_files_patterns=[ |
| ('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']), |
| ('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']), |
| ('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']), |
| ('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']), |
| ] , |
| valid_files_patterns=[ |
| ('dev/newstest2012.{lang}', ['es-en']) |
| ], |
| test_files_patterns=[ |
| ('test/newstest*.{lang}', ['es-en']) |
| ], |
| ) |
|
|
| wmt14_de_fr_en = DLDataset( |
| name='wmt14_de_fr_en', |
| train_urls=[ |
| 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-un.tgz', |
| 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz', |
| ('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), |
| ], |
| valid_urls=[ |
| ('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'), |
| ], |
| test_urls=[ |
| ('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), |
| ], |
| train_files_patterns=[ |
| ('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), |
| ('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), |
| ('*/*news-commentary-v9.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), |
| ('un/undoc.2000.{src}-{tgt}.{lang}', ['fr-en']), |
| ('*giga-{src}{tgt}*{lang}', ['fr-en']) |
| ], |
| valid_files_patterns=[ |
| ('dev/newstest2013.{lang}', ['fr-en', 'de-en']) |
| ], |
| test_files_patterns=[ |
| ('test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['en-de', 'de-en', 'fr-en', 'en-fr']), |
| ], |
| ) |
|
|
| |
| wmt16_ro_en = DLDataset( |
| name='wmt16_ro-en', |
| train_urls=[ |
| ('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'), |
| ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'), |
| ], |
| valid_urls=[ |
| ('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz') |
| ], |
| test_urls=[ |
| ('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz') |
| ], |
| train_files_patterns=[ |
| ('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']), |
| ('bitext.{lang}', ['ro-en']) |
| ] , |
| valid_files_patterns=[ |
| ('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'ro-en']) |
| ], |
| test_files_patterns=[ |
| ('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro']) |
| ], |
| ) |
|
|
| cwmt_wmt_instruction = 'cwmt download instruction at: http://nlp.nju.edu.cn/cwmt-wmt' |
| wmt17_fi_lv_tr_zh_en_manual_downloads = [ |
| |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), cwmt_wmt_instruction), |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), cwmt_wmt_instruction), |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), cwmt_wmt_instruction), |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), cwmt_wmt_instruction), |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), cwmt_wmt_instruction), |
| ( ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), cwmt_wmt_instruction), |
| ] |
| wmt17_fi_lv_tr_zh_en = DLDataset( |
| name='wmt17_fi_lv_tr_zh_en', |
| train_urls=[ |
| ('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'), |
| 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz', |
| 'http://www.statmt.org/wmt15/wiki-titles.tgz', |
| ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'), |
| ('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'), |
| 'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz', |
| 'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz', |
| 'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz', |
| (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00', |
| 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'), |
| |
| ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), |
| ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), |
| ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), |
| ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), |
| ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), |
| ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), |
| ], |
| valid_urls=[ |
| ('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'), |
| ], |
| test_urls=[ |
| |
| ('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'), |
| ('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz') |
| ], |
| train_files_patterns=[ |
| ('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), |
| ('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), |
| ('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']), |
| ('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en'] ), |
| ('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']), |
| ('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]), |
| |
| ('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']), |
| ('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]), |
| ('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']), |
| ('*/leta.{lang}', ['lv-en']), |
| ('*/dcep.{lang}', ['lv-en']), |
| ('*/farewell.{lang}', ['lv-en']), |
| ('bitext.{lang}', ['tr-en']), |
| ] , |
| valid_files_patterns=[ |
| ('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', |
| [ |
| 'fi-en', 'lv-en', 'tr-en', 'zh-en', |
| 'en-fi', 'en-lv', 'en-tr', 'en-zh' |
| ]), |
| ('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}', |
| [ |
| 'fi-en', 'tr-en', |
| 'en-fi', 'en-tr', |
| ]), |
| ], |
| test_files_patterns=[ |
| ('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', |
| [ |
| 'fi-en', 'lv-en', 'tr-en', |
| 'en-fi', 'en-lv', 'en-tr', |
| ]), |
| ('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', |
| [ |
| 'zh-en', |
| 'en-zh' |
| ]), |
| ], |
| ) |
|
|
| czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16' |
| |
wmt18_cs_et_en_manual_downloads = []
|
|
| wmt18_cs_et_en = DLDataset( |
| name='wmt18_cs_et_en', |
| train_urls=[ |
| 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', |
| 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz', |
| 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz', |
| 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', |
| 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz', |
        ('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'),
    ],
| valid_urls=[ |
| ('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'), |
| ], |
| test_urls=[ |
| ('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'), |
| ], |
    train_files_patterns=[
        ('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']),
        ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']),
        ('rapid2016.{tgt}-{src}.{lang}', ['et-en']),
    ],
    valid_files_patterns=[
        ('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']),
    ],
    test_files_patterns=[
        ('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']),
    ]
| ) |
|
|
| ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en' |
| wmt19_ru_gu_kk_lt_manual_downloads = [ |
| (('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction) |
| ] |
| wmt19_ru_gu_kk_lt = DLDataset( |
| name='wmt19_ru_gu_kk_lt', |
| train_urls=[ |
| 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz', |
| 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz', |
| 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz', |
| 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', |
| 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz', |
| 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz', |
| 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz', |
| 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz', |
| 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz', |
| 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz', |
| (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00', |
| 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01', |
| 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',), |
| 'wmt19_UNv1.0.en-ru.tar.gz'), |
| 'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip', |
| ('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), |
| ], |
| valid_urls=[ |
| ('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'), |
| ], |
| test_urls=[ |
| ('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'), |
| ], |
| train_files_patterns=[ |
| ('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']), |
| |
| ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']), |
| ('bitext.{lang}', ['lt-en',]), |
| ('*commoncrawl.{src}-{tgt}.{lang}', ['ru-en',]), |
| ('*news-commentary-v14-wmt19.{tgt}-{src}.tsv.{lang}', ['kk-en', ]), |
| ('*news-commentary-v14.{tgt}-{src}.tsv.{lang}', ['ru-en']), |
| |
| ('corpus.{tgt}_{src}.1m.{lang}', ['ru-en']), |
| ('wikititles_v1_wikititles-v1.{src}-{tgt}.tsv.{lang}', ['ru-en', 'kk-en', 'lt-en', 'gu-en']), |
| ('*/UNv1.0.{tgt}-{src}.{lang}', ['ru-en']), |
| |
| ('bitext.{lang}', ['lt-en']) |
| ], |
| valid_files_patterns=[ |
| ('dev/newsdev2019*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['gu-en', 'kk-en', 'lt-en']), |
| ('dev/newstest2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['ru-en']), |
| ], |
| test_files_patterns=[ |
| ('sgm/newstest2019-{src}{tgt}-{src:src}{tgt:ref}.{lang}', |
| ['ru-en', 'gu-en', 'kk-en', 'lt-en', 'en-ru', 'en-gu', 'en-kk', 'en-lt']), |
| ] |
| ) |
|
|
|
|
| |
|
|
| if __name__ == "__main__": |
| |
| dl_folder = f'{to_data_path}/downloads' |
| extract_folder = f'{to_data_path}/extracted' |
|
|
| urls = [ |
| url |
| for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt] |
| for urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls] |
| for url in urls |
| ] |
| urls = set(urls) |
| download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True) |
|
|
| |
    to_manually_download_urls = (
        wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads + wmt19_ru_gu_kk_lt_manual_downloads
    )
    to_be_manually_downloaded = check_need_manual_download(dl_folder, to_manually_download_urls)
    if len(to_be_manually_downloaded) > 0:
        print('Missing files need to be downloaded manually; stopping the process now.')
        sys.exit(-1)
| |
| completed_urls = {} |
| completed_extraction = {} |
| def work_on_wmt(directions, wmt_data): |
| download_and_extract( |
| to_data_path, |
| directions, |
| wmt_data, |
| to_manually_download_urls=to_manually_download_urls, |
| completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True) |
| |
| work_on_wmt( |
| ['es_XX-en_XX'], |
| wmt13_es_en,) |
| work_on_wmt( |
| [ |
| 'fr_XX-en_XX', 'en_XX-fr_XX', |
| |
| ], |
| wmt14_de_fr_en,) |
| work_on_wmt( |
        ['ro_RO-en_XX', 'en_XX-ro_RO'],
| wmt16_ro_en,) |
| work_on_wmt( |
| [ |
| |
| 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX', |
| |
| |
| 'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR', |
| ], |
| wmt17_fi_lv_tr_zh_en, ) |
| |
| |
| work_on_wmt( |
| [ |
| |
| 'et_EE-en_XX'], |
| wmt18_cs_et_en,) |
| work_on_wmt( |
| [ |
| |
| 'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX', |
| |
| 'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT' |
| ], |
| wmt19_ru_gu_kk_lt,) |
|
|
| not_matching = check_wmt_test_bleu( |
| f'{to_data_path}/raw', |
| [ |
| ('wmt13', ['es_XX-en_XX']), |
| ('wmt14/full', ['fr_XX-en_XX',]), |
| ('wmt16', ['ro_RO-en_XX',]), |
| |
| ('wmt17', [ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']), |
| ('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']), |
| ('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']), |
| |
| ] |
| ) |
| if len(not_matching) > 0: |
| print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching)) |
|
|
|
|