File size: 10,526 Bytes
4a08ba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/usr/bin/env python3

import re
import argparse
import sys
import csv
import traceback
from collections import deque, defaultdict
from math import inf


def sniff(filestream):
    """Detect the CSV dialect of *filestream* and return a comment-free line iterator.

    Reads a sample from the stream, drops comment ('#'-prefixed) and blank
    lines from it, and asks csv.Sniffer for the dialect.  Falls back to the
    standard "excel" dialect when sniffing fails.  The stream is rewound
    afterwards so the caller reads it from the start.

    Args:
        filestream: a seekable text stream.

    Returns:
        (lines, dialect): a generator over the stream's non-comment lines,
        and the detected csv dialect.
    """
    sample = filestream.read(1024)
    ## Sniffer is confused by comment and blank lines, so strip them from the sample.
    sample = "\n".join(line for line in re.split("[\r\n]", sample) if not (line.startswith("#") or len(line) == 0))
    try:
        dialect = csv.Sniffer().sniff(sample)
    except csv.Error:
        ## Sniffer raises csv.Error when it cannot find a consistent delimiter.
        print("Could not determine delimiter type, proceeding as excel csv", file=sys.stderr)
        dialect = csv.get_dialect("excel")

    filestream.seek(0)
    ret = (line for line in filestream if not line.startswith("#"))
    return (ret, dialect)


class subrule(object):
    """Lightweight wrapper for one substitution rule.

    Expands {class} references against the known classes, compiles the
    source and context patterns, and can score/apply itself at a position.
    """

    def __init__(self, values, classes):
        expanded = {}
        for field in ("sfrom", "sto", "precede", "follow", "weight"):
            text = values[field]
            ## Expand class references such as {vowel}; loop because a
            ## class expansion may itself contain further {class} refs.
            while re.search("{.*}", text):
                text = text.format(**classes)
            expanded[field] = text

        ## Attribute order matters for __repr__, so assign in the same
        ## order the original fields are listed.
        self.sfrom = re.compile(expanded["sfrom"])
        self.sto = expanded["sto"]
        self.precede = re.compile(expanded["precede"] + "$")
        self.follow = re.compile("^" + expanded["follow"])
        self.weight = float(expanded["weight"])

    def subScore(self, sfrom, precede, follow):
        """Return this rule's weight if it applies in the given context, else None."""
        applies = (self.sfrom.match(sfrom)
                   and self.precede.search(precede)
                   and self.follow.search(follow))
        return self.weight if applies else None

    def sub(self, x):
        """Apply the substitution pattern to *x* and return the result."""
        return self.sfrom.sub(self.sto, x)

    def __repr__(self):
        return repr(vars(self))

    def __lt__(self, other):
        ## Rules order by weight only; refuse comparison with other types.
        if not isinstance(other, subrule):
            raise Exception("Incompatible types for comparison")
        return self.weight < other.weight
    
            
class alphabet2ipa(object):
    """Rule-driven grapheme-to-IPA transliterator.

    Rules come from a CSV stream whose rows carry a "type" column:
      pre    -- character equivalence table applied before translation
      class  -- named character class usable as {name} in later rules
      sub    -- per-character substitution with context patterns and weight
      ipasub -- substitution applied to the assembled IPA string
      word   -- whole-word exception with a fixed translation
    """
    ##
    ## interpretation of rules files
    ##

    ## Class-level placeholders; every instance rebinds these in __init__.
    classes = None
    subs = None
    ##chartr = None
    
    def __init__(self, langrules, missing="@", loglevel=0):
        """Build the rule tables from *langrules*.

        Args:
            langrules: open text stream of CSV rules; '#' lines are comments.
                The stream is consumed and closed here (used as a context manager).
            missing: marker emitted for characters no sub rule matches.
            loglevel: verbosity; higher values print more diagnostics to stderr.
        """
        self.classes = dict()   # class name -> regex-fragment expansion
        self.subs = set()       # character-level subrule objects
        self.ipasubs = set()    # post-processing subrules on the IPA string
        self.words = dict()     # whole-word exceptions: word -> token list
        self.pre = str.maketrans("", "")   # identity table until a "pre" rule is seen
        self.NO_TRANSLATE = missing
        self.loglevel = loglevel

        
        with langrules as csvsource:
            ##rules = csv.DictReader(csvsource)
            (csvsource, dialect) = sniff(csvsource)
            rules = csv.DictReader(csvsource, dialect=dialect)
            for rule in rules:
                if self.loglevel > 2:
                    print("Rule found:", rule, file=sys.stderr)

                try:
                    
                    ##
                    ## Pre equivalences
                    ## NOTE(review): a later "pre" rule replaces (does not
                    ## merge with) any earlier table.
                    ##
                    if rule["type"] == "pre":
                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])

                    ##
                    ## Deal with classes
                    ##
                    elif rule["type"] == "class":
                        self.classes[rule["sfrom"]] = rule["sto"]

                    ##
                    ## Deal with sub rules
                    ##
                    elif rule["type"] == "sub":
                        newrule = subrule(rule, self.classes)
                        self.subs.add(newrule)

                    ##
                    ## Deal with IPA sub rules
                    ##
                    elif rule["type"] == "ipasub":
                        newrule = subrule(rule, self.classes)
                        self.ipasubs.add(newrule)

                    ##
                    ## Deal with whole word substitutions
                    ## (sto is split on whitespace into IPA tokens)
                    ##
                    elif rule["type"] == "word":
                        self.words[rule["sfrom"]] = rule["sto"].split()


                    ##
                    ## No such rule
                    ##
                    else:
                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
                        continue

                except Exception as ex:
                    ## Best-effort loading: report the bad rule and keep going.
                    ## NOTE(review): the extra "ex" argument is ignored by this
                    ## format string (only one {} placeholder).
                    errInfo = sys.exc_info()
                    traceback.print_exception(*errInfo)
                    print("Error processing rule, but resuming processing other rules. Rule details: {}".format(rule, ex), file=sys.stderr)
                    continue

                    
                if self.loglevel > 1:
                    print("Rule added:", rule, file=sys.stderr)




    def translate(self, source):
        """Translate one word to a list of IPA token strings.

        Whole-word exceptions short-circuit the process; otherwise each
        character is translated by the best-scoring applicable sub rule,
        and the joined result is post-processed by the ipasub rules in
        decreasing weight order.

        Args:
            source: the word to translate.

        Returns:
            list of IPA token strings (whitespace-split result).
        """
        ##
        ## fully translated words
        ##
        if source in self.words:
            return self.words[source]

        ##
        ## preprocess using pre, and turn to lowercase
        ##
        source = source.translate(self.pre).lower()

        ##
        ## If there are character-based translations, apply them first
        ## (for Cyrillic and other non-latin scripts)
        ##
        ##if not self.chartr is None:
        ##    source = source.translate(self.chartr)

        ## One list entry per character ("." does not match newline characters).
        sourceList = re.findall(".", source)
        targetList = deque()
        for (sx, sfrom) in enumerate(sourceList):
            ##
            ## prepare context
            ##
            precede = "".join(source[:sx])
            follow = "".join(source[sx+1:])

            ##
            ## perform all possible translations
            ##
            translations = [(rule.subScore(sfrom, precede, follow), rule.sub(sfrom)) for rule in self.subs]

            ##
            ## Exclude translations that didn't apply
            ##
            translations = [pair for pair in translations if not pair[0] is None]

            ##
            ## Choose best translation
            ## (highest weight wins; ties break on the substituted string,
            ## since the (score, result) tuples sort lexicographically)
            ##
            translation = sorted(translations)[-1][-1] if len(translations) > 0 else self.NO_TRANSLATE

            ## An empty substitution deletes the character entirely.
            if len(translation) == 0:
                continue
            
            targetList.append(translation)

        ## Join the per-character outputs with spaces, then apply the
        ## IPA-level rules, highest weight first (hence the negated key).
        targetString = " ".join(targetList)
        for (weight, rule) in sorted((-rule.weight, rule)
                                     for rule in self.ipasubs):
            targetString = rule.sub(targetString)
        


        return list(targetString.split())


    def check(self, cfile):
        """Check that words translate as they should.

        Reads (word, expected-translation) pairs from *cfile* (any csv
        format, no header, '#' comment lines ignored) and translates each
        word.  Returns True if all pairs match (or there are none).  Unless
        loglevel is negative, mismatches are printed to stderr.
        """


        ##
        ## Open the file, regardless of csv formats, excluding comment lines
        ##
        (csvsource, dialect) = sniff(cfile)
        data = csv.reader(csvsource, dialect=dialect)


        ##
        ## Iterate over all lines
        ##
        allGood = True                                     ## default is that it's all good
        for values in data:
            try:
               word = values[0]
               shouldbe = values[1]
            except Exception as ex:
                ## A short row: report it, count the run as failed, move on.
                ## NOTE(review): the extra "ex" argument is ignored by this
                ## format string (only one {} placeholder).
                errInfo = sys.exc_info()
                allGood = False
                traceback.print_exception(*errInfo)
                print("Error processing verification statement, but resuming processing other statements. Statement details: {}".format(values, ex), file=sys.stderr)
                continue
            
            translation = " ".join(self.translate(word))   ## translate returns a list, change to spaces
            if self.loglevel > 2:
                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)

            if not shouldbe == translation:                ## if wrong translation
                allGood = False                            ##    not all translations are good
                if self.loglevel >= 0:
                    print("Verification error, '{}' was translated to '{}', not '{}'"
                          .format(word, translation, shouldbe)
                          , file=sys.stderr)

            if self.loglevel > 0:
                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation)
                      , file=sys.stderr)
                    
        return allGood

def concatenate(*seqs):
    """Lazily yield the items of each of *seqs*, in order.

    Equivalent to itertools.chain(*seqs); kept as a local helper so the
    call sites read naturally.
    """
    for seq in seqs:
        yield from seq
    

def main(argv):
    """Command-line driver: translate words to IPA.

    Args:
        argv: argument list (excluding the program name).

    Returns:
        An iterable of (word, ipa_string) pairs; an empty list when a
        verification file is given and any check fails.
    """
    ## NOTE(review): the original passed the title as argparse's first
    ## positional parameter, which is "prog"; "description" is what was
    ## intended (prog now defaults to sys.argv[0]).
    parser = argparse.ArgumentParser(description="Translate words to ipa")

    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
                        help="language code rules file")

    ##
    ## Specifies the log level
    ##
    parser.add_argument("-v", "--verbose", dest="loglevel",
                        default=0, type=int,
                        help="Error level specification")


    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected.  The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")

    ##
    ## Allows reading data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (read up to first space)")

    ##
    ## Any following words are translated
    ##
    parser.add_argument("words", nargs="*")

    options = vars(parser.parse_args(argv))

    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])

    ## Run verification first and refuse to translate anything on failure.
    if options["check"] is not None:
        allGood = a2ipa.check(options["check"])
        if not allGood:
            print("Verification failed, not processing additional data", file=sys.stderr)
            return []

    ## Words read from a file: first comma/whitespace-delimited field per
    ## line.  Blank lines are skipped (the original raised IndexError).
    if options["read"] is not None:
        readwords = (fields.replace(",", " ").split()[0]
                     for fields in options["read"] if fields.strip())
    else:
        readwords = []

    words = concatenate(options["words"], readwords)

    return ((word, " ".join(a2ipa.translate(word)))
            for word in words)



if __name__ == "__main__":
    ## Emit one "word<TAB>translation" line per translated word.
    for word, ipa in main(sys.argv[1:]):
        print("{}\t{}".format(word, ipa))