File size: 2,639 Bytes
02b96b1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
import os
from subprocess import Popen, PIPE
import re
def _strip_unpaired_g_tags(text: str) -> str:
    """Unwrap every ``<g id="N">...</g>`` pair in *text*, keeping the inner text."""
    pattern = re.compile(r'<g id="\d+">(.*?)</g>')
    # A single pass only unwraps the outermost level of nested <g> elements
    # (the non-greedy match stops at the first </g>), so repeat until no tag
    # pair is left. Each pass strictly shortens the text, so this terminates.
    replaced = 1
    while replaced:
        text, replaced = pattern.subn(r'\1', text)
    return text


def moses_to_file(translated_moses_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                  original_xliff_file_path: str) -> str:
    """
    Put the translated Moses text back into the xliff file and merge it into a document again.

    The xliff file is rewritten in place twice: first by tikal (leveraging the translations),
    then by this function to strip the <g> tags that are still present.

    Parameters:
        translated_moses_file: Path to the translated Moses text file
        source_lang: Source language of the document
        target_lang: Target language of the document
        tikal_folder: Folder where tikal.sh is located
        original_xliff_file_path: Path to the xliff file to update and merge

    Returns:
        string: Text following "Output: " in tikal's merge output (the path to the output file)

    Raises:
        RuntimeError: If tikal's merge output contains no "Output:" line
    """
    tikal_script = os.path.join(tikal_folder, "tikal.sh")

    # put the translations into the xlf
    tikal_moses_to_xliff_command = [tikal_script, "-lm", original_xliff_file_path, "-sl",
                                    source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
                                    "-noalttrans", "-to", original_xliff_file_path]
    Popen(tikal_moses_to_xliff_command).wait()

    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
    # them. This may happen if a word in the original language has been split in more than one word with other
    # words in between, or because of an error in fastalign
    # NOTE(review): the file is read and written as UTF-8, matching how tikal's stdout is decoded below -
    # confirm tikal is not configured to emit another encoding.
    with open(original_xliff_file_path, encoding="utf-8") as xliff_file:
        text = xliff_file.read()
    result = _strip_unpaired_g_tags(text)
    with open(original_xliff_file_path, "w", encoding="utf-8") as xliff_file:
        xliff_file.write(result)

    # merge into a docx again
    tikal_merge_doc_command = [tikal_script, "-m", original_xliff_file_path]
    final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
    # communicate() already waits for the process to finish, no extra wait() needed
    stdout, stderr = final_process.communicate()

    # get the path to the output file
    output = stdout.decode('utf-8')
    match = re.search(r'(?<=Output:\s)(.*)', output)
    if match is None:
        # surface tikal's own error text instead of failing on None[0]
        raise RuntimeError(
            "tikal merge did not report an output file for "
            f"{original_xliff_file_path}: {stderr.decode('utf-8', errors='replace')}"
        )
    return match[0]
def file_to_moses(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                  original_xliff_file_path: str) -> str:
    """
    Turn a document into an xliff file and then into a plain text file holding its text contents,
    where style and formatting are kept through tags like <g id=1> </g>

    Parameters:
        input_file: Path to the document to process
        source_lang: Source language of the document
        target_lang: Target language of the document
        tikal_folder: Folder where tikal.sh is located
        original_xliff_file_path: Path to the xliff file to generate, which will be used later

    Returns:
        string: Path to the plain text file (the xliff path followed by "." and the source language)
    """
    tikal_script = os.path.join(tikal_folder, "tikal.sh")
    language_flags = ["-sl", source_lang, "-tl", target_lang]

    # NOTE(review): the first step is only given input_file; the second step presumably expects
    # the xliff it produced to be at original_xliff_file_path - verify against the caller.
    extraction_steps = (
        [tikal_script, "-x", input_file, "-nocopy", *language_flags],
        [tikal_script, "-xm", original_xliff_file_path, *language_flags],
    )
    # run the steps one after the other, waiting for each to finish
    for step in extraction_steps:
        Popen(step).wait()

    return original_xliff_file_path + f".{source_lang}"