|
|
import fileinput |
|
|
import os |
|
|
import platform |
|
|
from subprocess import Popen, PIPE |
|
|
|
|
|
|
|
|
|
|
|
class Aligner:
    """Word-align sentence pairs by driving the external fast_align and atools binaries.

    The constructor prepares three command lines: a forward fast_align run, a
    reverse fast_align run, and an atools run that symmetrizes the two results
    with the ``grow-diag-final-and`` heuristic. ``align_sentences`` executes
    them on a batch of sentence pairs and returns the parsed alignments.

    The binaries are referenced by relative name (``fast_align.exe`` /
    ``atools.exe`` on Windows, ``./fast_align`` / ``./atools`` elsewhere), so
    they are resolved relative to the process's working directory / PATH.
    """

    def __init__(self, config_folder, source_lang, target_lang, temp_folder):
        """Build the fast_align / atools command lines.

        Args:
            config_folder: Folder holding ``<src>-<tgt>.params`` / ``.err`` and
                ``<tgt>-<src>.params`` / ``.err`` files.
            source_lang: Source language tag used in those file names.
            target_lang: Target language tag used in those file names.
            temp_folder: Folder for intermediate files; created if missing.
        """
        os.makedirs(temp_folder, exist_ok=True)

        forward_pair = f"{source_lang}-{target_lang}"
        reverse_pair = f"{target_lang}-{source_lang}"

        forward_params_path = os.path.join(config_folder, f"{forward_pair}.params")
        reverse_params_path = os.path.join(config_folder, f"{reverse_pair}.params")

        # The .err logs provide the values passed to fast_align's -T and -m options.
        fwd_T, fwd_m = self.__read_err(os.path.join(config_folder, f"{forward_pair}.err"))
        rev_T, rev_m = self.__read_err(os.path.join(config_folder, f"{reverse_pair}.err"))

        self.forward_alignment_file_path = os.path.join(temp_folder, "forward.align")
        self.reverse_alignment_file_path = os.path.join(temp_folder, "reverse.align")

        if platform.system().lower() == "windows":
            fastalign_bin = "fast_align.exe"
            atools_bin = "atools.exe"
        else:
            fastalign_bin = "./fast_align"
            atools_bin = "./atools"

        self.temp_file_path = os.path.join(temp_folder, "tokenized_sentences_to_align.txt")

        self.forward_command = [
            fastalign_bin, "-i", self.temp_file_path, "-d",
            "-T", fwd_T, "-m", fwd_m, "-f", forward_params_path,
        ]
        # fast_align's reverse-direction switch is "-r"; a bare "r" is just a
        # positional argument and would leave the run in forward mode.
        self.reverse_command = [
            fastalign_bin, "-i", self.temp_file_path, "-d",
            "-T", rev_T, "-m", rev_m, "-f", reverse_params_path, "-r",
        ]

        self.symmetric_command = [
            atools_bin, "-i", self.forward_alignment_file_path,
            "-j", self.reverse_alignment_file_path, "-c", "grow-diag-final-and",
        ]

    def __simplify_alignment_file(self, file):
        """Rewrite ``file`` so each line keeps only its third ``|||``-separated field.

        The file is read and written as UTF-8 explicitly so the sentence text
        on each line cannot trip over the platform's default encoding.

        Raises:
            IndexError: If a line has fewer than three ``|||``-separated fields
                (raised before the file is modified).
        """
        with open(file, encoding="utf-8") as raw:
            kept_fields = [line.split('|||')[2].strip() for line in raw]

        with open(file, "w", encoding="utf-8") as simplified:
            simplified.writelines(f"{field}\n" for field in kept_fields)

    def __read_err(self, err):
        """Extract the ``(T, m)`` values from a fast_align ``.err`` log.

        ``m`` is the last whitespace-separated token of a line containing
        ``expected target length``; ``T`` is the last token of a line
        containing ``final tension``. When several lines match, the last one
        wins; a value that is never found stays ``''``.
        """
        T, m = '', ''
        # Context manager so the log file handle is closed deterministically.
        with open(err) as err_file:
            for line in err_file:
                if 'expected target length' in line:
                    m = line.split()[-1]
                elif 'final tension' in line:
                    T = line.split()[-1]
        return T, m

    @staticmethod
    def __parse_alignment_line(line):
        """Turn one ``i-j i-j ...`` line into a list of ``(int, int)`` tuples.

        Splitting on arbitrary whitespace makes an empty line yield ``[]``
        instead of failing on an empty token.
        """
        return [(int(i), int(j)) for i, j in (pair.split("-") for pair in line.split())]

    def align_sentences(self, original_sentences, translated_sentences):
        """Align each original sentence with its translation.

        Args:
            original_sentences: Iterable of tokenized source sentences.
            translated_sentences: Iterable of tokenized target sentences, paired
                positionally with ``original_sentences`` (pairing stops at the
                shorter iterable).

        Returns:
            One list per line printed by the symmetrization command, each
            holding ``(i, j)`` integer index pairs; a line without links
            yields an empty list.

        Raises:
            RuntimeError: If one of the external commands exits with a
                non-zero status.
        """
        # fast_align input format: "source ||| target", one pair per line.
        with open(self.temp_file_path, "w", encoding="utf-8") as temp_file:
            temp_file.writelines(
                f"{original} ||| {translated}\n"
                for original, translated in zip(original_sentences, translated_sentences)
            )

        # Run both directions concurrently, each writing straight to its file.
        with open(self.forward_alignment_file_path, 'w') as f_out, \
                open(self.reverse_alignment_file_path, 'w') as r_out:
            fw_process = Popen(self.forward_command, stdout=f_out)
            r_process = Popen(self.reverse_command, stdout=r_out)
            fw_process.wait()
            r_process.wait()

        for direction, finished in (("forward", fw_process), ("reverse", r_process)):
            if finished.returncode != 0:
                raise RuntimeError(
                    f"{direction} fast_align run failed with exit code {finished.returncode}"
                )

        self.__simplify_alignment_file(self.forward_alignment_file_path)
        self.__simplify_alignment_file(self.reverse_alignment_file_path)

        process = Popen(self.symmetric_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # communicate() drains both pipes while waiting; calling wait() first
        # can deadlock once the child fills an OS pipe buffer.
        stdout_data, stderr_data = process.communicate()
        if process.returncode != 0:
            details = stderr_data.decode('utf-8', errors='replace').strip()
            raise RuntimeError(
                f"alignment symmetrization failed with exit code {process.returncode}: {details}"
            )

        alignments_str = stdout_data.decode('utf-8')
        return [self.__parse_alignment_line(line) for line in alignments_str.splitlines()]
|
|
|