|
|
|
|
|
|
|
|
import io |
|
|
import json |
|
|
import subprocess |
|
|
|
|
|
|
|
|
pairs = [ |
|
|
["en", "ru"], |
|
|
["ru", "en"], |
|
|
["en", "de"], |
|
|
["de", "en"], |
|
|
] |
|
|
|
|
|
n_objs = 8 |
|
|
|
|
|
|
|
|
def get_all_data(pairs, n_objs): |
|
|
text = {} |
|
|
for src, tgt in pairs: |
|
|
pair = f"{src}-{tgt}" |
|
|
cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split() |
|
|
src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() |
|
|
cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split() |
|
|
tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines() |
|
|
text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]} |
|
|
return text |
|
|
|
|
|
|
|
|
text = get_all_data(pairs, n_objs) |
|
|
filename = "./fsmt_val_data.json" |
|
|
with io.open(filename, "w", encoding="utf-8") as f: |
|
|
bleu_data = json.dump(text, f, indent=2, ensure_ascii=False) |
|
|
|