XPF / Code /translate04.py
niobures's picture
XPF
4a08ba7 verified
#!/usr/bin/env python3
import re
import argparse
import sys
import csv
import traceback
from collections import deque, defaultdict
from math import inf
def sniff(filestream):
    """Read delimited text and guess which csv dialect it uses.

    Comment lines (starting with "#") and blank lines are discarded.  If
    every remaining line contains a tab the "excel-tab" dialect is used;
    otherwise ``csv.Sniffer`` is asked to guess, and the "excel" dialect
    is used when the sniffer cannot decide.

    Args:
        filestream: iterable of text lines, e.g. an open text file.

    Returns:
        A ``(lines, dialect)`` tuple: the retained lines (line endings
        kept as read) and the dialect to hand to ``csv.reader`` or
        ``csv.DictReader``.
    """
    ##
    ## Lines read from a file keep their trailing newline, so a length
    ## test never detects a blank line; strip before testing instead.
    ##
    lines = [line for line in filestream
             if line.strip() and not line.startswith("#")]
    if all("\t" in line for line in lines):
        return (lines, csv.get_dialect("excel-tab"))
    ##
    ## Build the sample without doubling the line endings already
    ## present in each line.
    ##
    sample = "\n".join(line.rstrip("\r\n") for line in lines)
    try:
        dialect = csv.Sniffer().sniff(sample)
    except Exception:
        ## best effort: fall back to plain comma-separated values
        print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
        dialect = csv.get_dialect("excel")
    return (lines, dialect)
class subrule(object):
    """One weighted, context-sensitive substitution rule.

    A rule is built from a mapping holding the keys "sfrom", "sto",
    "precede", "follow" and "weight".  Every value may reference
    character classes with ``{name}`` placeholders, which are expanded
    (repeatedly, so classes may reference other classes) before use.

    After construction the instance carries:
        sfrom:   compiled pattern for the text to replace
        sto:     replacement template given to ``re.sub``
        precede: compiled pattern that must match at the END of the
                 preceding context
        follow:  compiled pattern that must match at the START of the
                 following context
        weight:  float score returned when the rule applies
    """

    ## Upper bound on expansion rounds, so that a class that refers to
    ## itself is reported as an error instead of looping forever.
    _MAX_EXPANSIONS = 64

    def __init__(self, values, classes):
        """Build the rule from ``values``, expanding ``classes`` placeholders.

        Raises:
            KeyError: a required key or a referenced class is missing.
            ValueError: class expansion does not terminate, or the
                weight is not a number.
            re.error: an expanded pattern is not a valid regular
                expression.
        """
        for key in ("sfrom", "sto", "precede", "follow", "weight"):
            setattr(self, key, self._expand(values[key], classes))
        self.weight = float(self.weight)
        self.sfrom = re.compile(self.sfrom)
        ##
        ## Group the context patterns before anchoring them; otherwise an
        ## alternation such as "a|o" would become "a|o$", which anchors
        ## only its last branch.
        ##
        self.precede = re.compile("(?:" + self.precede + ")$")
        self.follow = re.compile("^(?:" + self.follow + ")")

    @classmethod
    def _expand(cls, value, classes):
        """Return ``value`` with all ``{name}`` class references expanded."""
        rounds = 0
        while re.search("{.*}", value):
            if rounds >= cls._MAX_EXPANSIONS:
                raise ValueError("Class expansion does not terminate: {!r}".format(value))
            value = value.format(**classes)
            rounds += 1
        return value

    def subScore(self, sfrom, precede, follow):
        """Return the rule weight if it applies in this context, else None.

        Args:
            sfrom: the text being translated (matched from its start).
            precede: everything before it in the word.
            follow: everything after it in the word.
        """
        applies = (self.sfrom.match(sfrom)
                   and self.precede.search(precede)
                   and self.follow.search(follow))
        return self.weight if applies else None

    def sub(self, x):
        """Return ``x`` with every ``sfrom`` match replaced by ``sto``."""
        return self.sfrom.sub(self.sto, x)

    def __repr__(self):
        return repr(self.__dict__)

    def __lt__(self, other):
        """Order rules by weight; other types are not comparable."""
        if not isinstance(other, subrule):
            ## lets Python raise TypeError for unsupported comparisons
            return NotImplemented
        return self.weight < other.weight
class alphabet2ipa(object):
    """Translate written words to IPA symbols using a rules file.

    The rules file is delimited text with a header row; each row has a
    "type" column that selects how the row is stored:

        pre     character-for-character mapping applied before anything else
        class   named character class usable as ``{name}`` in later rules
        match   context-free translation of one character
        sub     context-sensitive, weighted substitution (``subrule``)
        ipasub  weighted substitution applied to the translated string
        word    complete translation for one whole word
    """
    classes = None
    subs = None

    def __init__(self, langrules, missing="@", loglevel=0):
        """Load every rule from ``langrules``.

        Args:
            langrules: open text file with the rules; it is closed once read.
            missing: symbol emitted for characters that no rule translates.
            loglevel: verbosity; above 1 logs each stored rule, above 2
                also logs each rule as it is read.

        A rule that cannot be processed is reported on stderr and
        skipped; loading continues with the next rule.
        """
        self.classes = dict()
        self.subs = set()
        self.ipasubs = set()
        self.words = dict()
        self.matches = dict()
        self.pre = str.maketrans("", "")
        self.NO_TRANSLATE = missing
        self.loglevel = loglevel
        ## sniff() reads every line, so the file is only needed here
        with langrules as csvsource:
            (lines, dialect) = sniff(csvsource)
        for rule in csv.DictReader(lines, dialect=dialect):
            if self.loglevel > 2:
                print("Rule found:", rule, file=sys.stderr)
            try:
                recognized = self._add_rule(rule)
            except Exception:
                traceback.print_exc()
                print("Error processing rule, but resuming processing other rules. Rule details: {}".format(rule), file=sys.stderr)
                continue
            if recognized and self.loglevel > 1:
                print("Rule added:", rule, file=sys.stderr)

    def _add_rule(self, rule):
        """Store one parsed rule row; return False if its type is unknown."""
        kind = rule["type"]
        if kind == "pre":
            ## merge, so that several "pre" rows add up instead of the
            ## last one silently replacing the others
            self.pre.update(str.maketrans(rule["sfrom"], rule["sto"]))
        elif kind == "class":
            self.classes[rule["sfrom"]] = rule["sto"]
        elif kind == "match":
            ## expand classes (and classes within classes)
            value = rule["sto"]
            while re.search("{.*}", value):
                value = value.format(**self.classes)
            self.matches[rule["sfrom"]] = value
        elif kind == "sub":
            self.subs.add(subrule(rule, self.classes))
        elif kind == "ipasub":
            self.ipasubs.add(subrule(rule, self.classes))
        elif kind == "word":
            self.words[rule["sfrom"]] = rule["sto"].split()
        else:
            print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
            return False
        return True

    def translate(self, source):
        """Return the translation of ``source`` as a list of symbols.

        A word with a "word" rule gets that translation directly.
        Otherwise the word is passed through the "pre" mapping and
        lower-cased, each character is translated by its "match" rule or
        by the highest-scoring applicable "sub" rule (``NO_TRANSLATE``
        if none applies), and finally the "ipasub" rules are applied to
        the space-joined result in order of decreasing weight.
        """
        ##
        ## fully translated words (copied, so callers cannot alter the table)
        ##
        if source in self.words:
            return list(self.words[source])
        source = source.translate(self.pre).lower()
        targetList = []
        for (sx, sfrom) in enumerate(source):
            ##
            ## Newlines are never translated; skipping them here (rather
            ## than filtering them out beforehand) keeps sx aligned with
            ## the positions in source used for the context below.
            ##
            if sfrom == "\n":
                continue
            if sfrom in self.matches:
                ## direct translation, no regular expressions needed
                translation = self.matches[sfrom]
            else:
                precede = source[:sx]
                follow = source[sx + 1:]
                ## only rules that apply produce a candidate
                candidates = []
                for rule in self.subs:
                    score = rule.subScore(sfrom, precede, follow)
                    if score is not None:
                        candidates.append((score, rule.sub(sfrom)))
                translation = max(candidates)[-1] if candidates else self.NO_TRANSLATE
            ## an empty translation means the character is silent
            if translation:
                targetList.append(translation)
        targetString = " ".join(targetList)
        for rule in sorted(self.ipasubs, key=lambda item: -item.weight):
            targetString = rule.sub(targetString)
        return targetString.split()

    def check(self, cfile):
        """Verify translations against a file of expected results.

        Each row of ``cfile`` (any csv format, no header, comment lines
        ignored) holds a word and its expected translation with symbols
        separated by spaces.  Returns True if every word translates as
        expected (or if there are no words).  Unless loglevel is
        negative, mismatches are printed to stderr.
        """
        (rows, dialect) = sniff(cfile)
        allGood = True  ## default is that it's all good
        for values in csv.reader(rows, dialect=dialect):
            if not values:
                ## blank line: nothing to verify
                continue
            try:
                word = values[0]
                shouldbe = values[1]
            except IndexError:
                allGood = False
                traceback.print_exc()
                print("Error processing verification statement, but resuming processing other statements. Statement details: {}".format(values), file=sys.stderr)
                continue
            ## translate returns a list; join it to compare with the file
            translation = " ".join(self.translate(word))
            if self.loglevel > 2:
                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)
            if shouldbe != translation:
                allGood = False
                if self.loglevel >= 0:
                    print("Verification error, '{}' was translated to '{}', not '{}'"
                          .format(word, translation, shouldbe),
                          file=sys.stderr)
            if self.loglevel > 0:
                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation),
                      file=sys.stderr)
        return allGood
def concatenate(*seqs):
    """Yield every item of each given iterable, one iterable after another."""
    for sequence in seqs:
        yield from sequence
def main(argv):
    """Parse the command line and translate the requested words.

    Args:
        argv: command-line arguments, without the program name.

    Returns:
        An iterable of ``(word, translation)`` pairs, the translation
        being a space-separated string of symbols.  Words given on the
        command line come first, followed by the first field of each
        non-blank line of the ``--read`` file.  An empty list is
        returned when a ``--check`` file is given and verification fails.
    """
    parser = argparse.ArgumentParser("Translate words to ipa")
    ##
    ## Specifies the rules used for translation
    ##
    parser.add_argument("-l", "--langrules", dest="langrules",
                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
                        help="language code rules file")
    ##
    ## Specifies the log level
    ##
    parser.add_argument("-v", "--verbose", dest="loglevel",
                        default=0, type=int,
                        help="Error level specification")
    ##
    ## Specifies a verification file, in any csv format. Headers are
    ## not expected. The first column is supposed to be a word, and
    ## the second is its ideal translation
    ##
    parser.add_argument("-c", "--check", dest="check",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file to use for verification")
    ##
    ## Allows reading data from some file (which should not be compressed)
    ##
    parser.add_argument("-r", "--read", dest="read",
                        default=None, type=argparse.FileType('r', encoding="utf8"),
                        help="file used for translation (read up to first space)")
    ##
    ## Any following words would be translated
    ##
    parser.add_argument("words", nargs="*")
    options = vars(parser.parse_args(argv))
    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])
    checkfile = options.get("check")
    if checkfile is not None and not a2ipa.check(checkfile):
        print("Verification failed, not processing additional data", file=sys.stderr)
        return []
    readfile = options.get("read")
    if readfile is not None:
        ##
        ## Commas count as separators too. A blank line splits into an
        ## empty list, so it is skipped instead of being indexed.
        ##
        tokenized = (line.replace(",", " ").split() for line in readfile)
        readwords = (fields[0] for fields in tokenized if fields)
    else:
        readwords = []
    words = concatenate(options["words"], readwords)
    return ((word, " ".join(a2ipa.translate(word)))
            for word in words)
if __name__ == "__main__":
    ## Print one tab-separated "word<TAB>translation" line per result.
    results = main(sys.argv[1:])
    for pair in results:
        print("\t".join(pair))