File size: 2,639 Bytes
02b96b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os
from subprocess import Popen, PIPE
import re

def _strip_unpaired_g_tags(xliff_text: str) -> str:
    """Unwrap every ``<g id="N">...</g>`` element, keeping only its inner text."""
    g_element = re.compile(r'<g id="\d+">(.*?)</g>')
    # One substitution pass only unwraps the outer element of a nested pair
    # (the lazy match stops at the first closing tag), so repeat until a pass
    # finds nothing left to unwrap. Every replacement shortens the text, so
    # the loop always terminates.
    replaced = 1
    while replaced:
        xliff_text, replaced = g_element.subn(r'\1', xliff_text)
    return xliff_text


def moses_to_file(translated_moses_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                      original_xliff_file_path: str) -> str:
    """
    Put the translated Moses text back into the xliff file and merge it into a document again.

    The xliff file is rewritten in place: first by tikal (leveraging the translations), then by
    this function, which removes the <g id="N"> tags that are still present afterwards.

    Parameters:
    translated_moses_file: Path to the translated Moses (plain text) file
    source_lang: Source language of the document
    target_lang: Target language of the document
    tikal_folder: Folder where tikal.sh is located
    original_xliff_file_path: Path to the xliff file to update and merge

    Returns:
    string: Path to the merged output file, as reported by tikal after "Output:"

    Raises:
    RuntimeError: If the merge command does not print an "Output:" line
    """
    tikal_script = os.path.join(tikal_folder, "tikal.sh")

    # put the translations into the xlf
    tikal_moses_to_xliff_command = [tikal_script, "-lm", original_xliff_file_path, "-sl",
                                    source_lang, "-tl", target_lang, "-from", translated_moses_file, "-totrg",
                                    "-noalttrans", "-to", original_xliff_file_path]
    Popen(tikal_moses_to_xliff_command).wait()

    # any tags that are still <g> have not been paired between original and translated texts by tikal so we remove
    # them. This may happen if a word in the original language has been split into more than one word with other
    # words in between, or because of an error in fastalign.
    # The encoding is explicit so the result does not depend on the platform locale, and the context managers
    # guarantee the rewritten file is flushed and closed before tikal reads it again.
    with open(original_xliff_file_path, encoding="utf-8") as xliff_file:
        text = xliff_file.read()
    result = _strip_unpaired_g_tags(text)
    with open(original_xliff_file_path, "w", encoding="utf-8") as xliff_file:
        xliff_file.write(result)

    # merge into a docx again
    tikal_merge_doc_command = [tikal_script, "-m", original_xliff_file_path]
    final_process = Popen(tikal_merge_doc_command, stdout=PIPE, stderr=PIPE)
    # communicate() already waits for the process to finish
    stdout, stderr = final_process.communicate()

    # get the path to the output file
    output = stdout.decode('utf-8')
    output_match = re.search(r'(?<=Output:\s)(.*)', output)
    if output_match is None:
        # Surface what tikal reported instead of failing on a None subscript
        raise RuntimeError(
            "tikal did not report an output file when merging "
            f"{original_xliff_file_path}: {stderr.decode('utf-8', errors='replace').strip()}"
        )
    return output_match[0]

def file_to_moses(input_file: str, source_lang: str, target_lang: str, tikal_folder: str,
                      original_xliff_file_path: str) -> str:
    """
    Run tikal twice: once to extract an xliff file from a document, and once to turn an xliff file
    into a Moses plain text file in which style and formatting are kept as tags like <g id=1> </g>

    Parameters:
    input_file: Path to the document passed to the extraction step
    source_lang: Source language of the document
    target_lang: Target language of the document
    tikal_folder: Folder where tikal.sh is located
    original_xliff_file_path: Path to the xliff file passed to the Moses step; presumably the file
        the extraction step has just produced (this function does not verify that)

    Returns:
    string: original_xliff_file_path followed by "." and the source language, i.e. the expected
        path to the plain text file
    """
    language_flags = ["-sl", source_lang, "-tl", target_lang]

    # document -> xliff
    extract_xliff = [os.path.join(tikal_folder, "tikal.sh"), "-x", input_file, "-nocopy"] + language_flags
    # xliff -> Moses plain text
    xliff_to_moses = [os.path.join(tikal_folder, "tikal.sh"), "-xm", original_xliff_file_path] + language_flags

    # The steps run one after the other; each one is awaited before the next starts
    for tikal_call in (extract_xliff, xliff_to_moses):
        Popen(tikal_call).wait()

    moses_suffix = ".{}".format(source_lang)
    return os.path.join(original_xliff_file_path + moses_suffix)