XPF / Code /translate04.py

XPF

4a08ba7 verified 3 months ago

11.4 kB

	#!/usr/bin/env python3

	import re
	import argparse
	import sys
	import csv
	import traceback
	from collections import deque, defaultdict
	from math import inf


	def sniff(filestream):
	    """Collect the data lines of a delimited text stream and guess its dialect.

	    Lines that start with "#" and lines that contain only whitespace are
	    dropped. If every remaining line contains a tab, the "excel-tab"
	    dialect is used; otherwise ``csv.Sniffer`` is asked to guess, and the
	    plain "excel" dialect is the fallback when the guess fails.

	    Args:
	        filestream: iterable of text lines (e.g. an open text file).

	    Returns:
	        A ``(lines, dialect)`` tuple: the kept lines (line endings are left
	        untouched) and the dialect to hand to ``csv.reader``/``DictReader``.
	    """
	    ## Lines read from a file keep their trailing newline, so emptiness has
	    ## to be tested on the stripped text; a bare length check never fires.
	    lines = [line for line in filestream
	             if line.strip() and not line.startswith("#")]

	    if all("\t" in line for line in lines):
	        dialect = csv.get_dialect("excel-tab")
	    else:
	        ## Strip line endings before joining so the sample does not contain
	        ## doubled line breaks.
	        sample = "\n".join(line.rstrip("\r\n") for line in lines)
	        try:
	            dialect = csv.Sniffer().sniff(sample)
	        except Exception:
	            ## Best effort: fall back to ordinary comma-separated values.
	            print("Could not determine delimiter type, proceedings as excel csv", file=sys.stderr)
	            dialect = csv.get_dialect("excel")
	    return (lines, dialect)


	class subrule(object):
	    """A single weighted substitution rule with left and right context.

	    The rule is built from a mapping holding the string fields "sfrom",
	    "sto", "precede", "follow" and "weight". After construction ``sfrom``,
	    ``precede`` and ``follow`` are compiled regular expressions, ``sto`` is
	    the replacement string and ``weight`` is a float.
	    """

	    def __init__(self, values, classes):
	        """Read the five rule fields from ``values``, expanding class references.

	        Args:
	            values: mapping with the keys "sfrom", "sto", "precede",
	                "follow" and "weight" (all strings).
	            classes: mapping of class name to its expansion, used to fill
	                "{name}" placeholders in any of the fields.
	        """
	        for field in ("sfrom", "sto", "precede", "follow", "weight"):
	            text = values[field]
	            ## Keep formatting while braces remain, so that a class whose
	            ## expansion mentions another class is resolved as well.
	            while re.search("{.*}", text):
	                text = text.format(**classes)
	            setattr(self, field, text)

	        self.weight = float(self.weight)
	        self.sfrom = re.compile(self.sfrom)
	        ## The left context must end right before the character ...
	        self.precede = re.compile(self.precede + "$")
	        ## ... and the right context must start right after it.
	        self.follow = re.compile("^" + self.follow)

	    def subScore(self, sfrom, precede, follow):
	        """Return the rule weight if the rule applies here, otherwise None."""
	        if not self.sfrom.match(sfrom):
	            return None
	        if not self.precede.search(precede):
	            return None
	        if not self.follow.search(follow):
	            return None
	        return self.weight

	    def sub(self, x):
	        """Return ``x`` with every match of ``sfrom`` replaced by ``sto``."""
	        return self.sfrom.sub(self.sto, x)

	    def __repr__(self):
	        return repr(vars(self))

	    def __lt__(self, other):
	        """Order rules by weight; comparing with a non-rule raises Exception."""
	        if isinstance(other, subrule):
	            return self.weight < other.weight
	        raise Exception("Incompatible types for comparison")


	class alphabet2ipa(object):
	    """Translate written words into lists of IPA symbols using a rules file.

	    The rules file is a delimited table read with ``csv.DictReader``; each
	    row has a "type" column plus "sfrom"/"sto" (and, for substitution
	    rules, "precede", "follow" and "weight"). Recognized types:

	    * "pre":    character-for-character mapping applied before anything
	                else (a later "pre" row replaces an earlier one).
	    * "class":  names a reusable pattern that other rows reference as
	                "{name}".
	    * "match":  direct translation of a single character.
	    * "sub":    context-sensitive, weighted substitution of a character.
	    * "ipasub": weighted substitution applied to the translated string.
	    * "word":   whole-word translation; "sto" is split on whitespace.
	    """

	    classes = None
	    subs = None

	    def __init__(self, langrules, missing="@", loglevel=0):
	        """Load all rules from ``langrules``.

	        Args:
	            langrules: open text stream holding the rules; it is used as a
	                context manager and therefore closed after loading.
	            missing: symbol emitted for a character no rule translates.
	            loglevel: verbosity; above 1 added rules are printed, above 2
	                every row read is printed (all to stderr).

	        A row that raises while being processed is reported on stderr and
	        skipped; loading continues with the next row.
	        """
	        self.classes = dict()
	        self.subs = set()
	        self.ipasubs = set()
	        self.words = dict()
	        self.matches = dict()
	        self.pre = str.maketrans("", "")
	        self.NO_TRANSLATE = missing
	        self.loglevel = loglevel

	        with langrules as csvsource:
	            (csvsource, dialect) = sniff(csvsource)
	            rules = csv.DictReader(csvsource, dialect=dialect)
	            for rule in rules:
	                if self.loglevel > 2:
	                    print("Rule found:", rule, file=sys.stderr)

	                try:
	                    ruletype = rule["type"]

	                    ## Pre equivalences
	                    if ruletype == "pre":
	                        self.pre = str.maketrans(rule["sfrom"], rule["sto"])

	                    ## Classes are stored unexpanded; they are resolved
	                    ## when a later rule references them.
	                    elif ruletype == "class":
	                        self.classes[rule["sfrom"]] = rule["sto"]

	                    ## Match rules
	                    elif ruletype == "match":
	                        self.matches[rule["sfrom"]] = self._expand_classes(rule["sto"], self.classes)

	                    ## Sub rules
	                    elif ruletype == "sub":
	                        self.subs.add(subrule(rule, self.classes))

	                    ## IPA sub rules
	                    elif ruletype == "ipasub":
	                        self.ipasubs.add(subrule(rule, self.classes))

	                    ## Whole word substitutions
	                    elif ruletype == "word":
	                        self.words[rule["sfrom"]] = rule["sto"].split()

	                    ## No such rule
	                    else:
	                        print("Unrecognized rule type ({type}), with sfrom={sfrom}, and sto={sto}".format(**rule), file=sys.stderr)
	                        continue

	                except Exception as ex:
	                    traceback.print_exc()
	                    ## Include the exception text; it was previously passed
	                    ## to format() without a placeholder and silently lost.
	                    print("Error processing rule, but resuming processing other rules. Rule details: {} ({})".format(rule, ex), file=sys.stderr)
	                    continue

	                if self.loglevel > 1:
	                    print("Rule added:", rule, file=sys.stderr)

	    @staticmethod
	    def _expand_classes(value, classes):
	        """Fill "{name}" placeholders in ``value`` until no braces remain."""
	        while re.search("{.*}", value):
	            value = value.format(**classes)
	        return value

	    def translate(self, source):
	        """Translate one word and return the result as a list of symbols.

	        A word listed in a "word" rule is returned directly (as a fresh
	        list). Otherwise the word is passed through the "pre" mapping,
	        lower-cased and translated character by character: a "match" rule
	        wins outright, else the applicable "sub" rule with the highest
	        weight is used, else the ``missing`` symbol. Empty translations are
	        dropped. The pieces are joined with spaces, every "ipasub" rule is
	        applied in order of decreasing weight, and the result is split on
	        whitespace.
	        """
	        ## Fully translated words. Hand out a copy so that callers cannot
	        ## modify the stored translation.
	        if source in self.words:
	            return list(self.words[source])

	        ## Preprocess using pre, and turn to lowercase
	        source = source.translate(self.pre).lower()

	        ## "." skips newline characters, so rebuild the text from the
	        ## characters actually visited; context slices then always line up
	        ## with the enumeration index.
	        chars = re.findall(".", source)
	        text = "".join(chars)

	        pieces = []
	        for (sx, sfrom) in enumerate(chars):
	            if sfrom in self.matches:
	                ## Translate "as is" and skip the costly regular expressions
	                translation = self.matches[sfrom]
	            else:
	                precede = text[:sx]
	                follow = text[sx+1:]

	                ## Only rules that apply in this context are substituted.
	                candidates = []
	                for rule in self.subs:
	                    score = rule.subScore(sfrom, precede, follow)
	                    if score is not None:
	                        candidates.append((score, rule.sub(sfrom)))

	                ## Highest weight wins; equal weights fall back to the
	                ## greater translation string, as tuple ordering dictates.
	                translation = max(candidates)[-1] if candidates else self.NO_TRANSLATE

	            if len(translation) == 0:
	                continue

	            pieces.append(translation)

	        targetString = " ".join(pieces)
	        for rule in sorted(self.ipasubs, key=lambda rule: -rule.weight):
	            targetString = rule.sub(targetString)

	        return list(targetString.split())

	    def check(self, cfile):
	        """Verify that words translate as expected.

	        ``cfile`` is a delimited text stream without headers whose first
	        column is a word and whose second column is its expected
	        translation (symbols separated by spaces). Comment lines are
	        excluded by ``sniff``.

	        Returns True if every row translates as expected (or there are no
	        rows). A row with fewer than two columns counts as a failure.
	        Unless loglevel is negative, mismatches are printed to stderr.
	        """
	        (csvsource, dialect) = sniff(cfile)
	        data = csv.reader(csvsource, dialect=dialect)

	        allGood = True  ## default is that it's all good
	        for values in data:
	            try:
	                word = values[0]
	                shouldbe = values[1]
	            except IndexError as ex:
	                allGood = False
	                traceback.print_exc()
	                print("Error processing verification statement, but resuming processing other statements. Statement details: {} ({})".format(values, ex), file=sys.stderr)
	                continue

	            ## translate returns a list, change to spaces
	            translation = " ".join(self.translate(word))
	            if self.loglevel > 2:
	                print("Does '{}' translate to '{}'?".format(word, shouldbe), file=sys.stderr)

	            if not shouldbe == translation:
	                allGood = False
	                if self.loglevel >= 0:
	                    print("Verification error, '{}' was translated to '{}', not '{}'"
	                          .format(word, translation, shouldbe),
	                          file=sys.stderr)

	            if self.loglevel > 0:
	                print("Word '{} ({})' translated to '{}'".format(word, shouldbe, translation),
	                      file=sys.stderr)

	        return allGood

	def concatenate(*seqs):
	    """Lazily yield the items of each given iterable, one iterable after another."""
	    for sequence in seqs:
	        yield from sequence


	def main(argv):
	    """Parse the command line and translate the requested words.

	    Args:
	        argv: list of command-line arguments, without the program name.

	    Returns:
	        An iterable of ``(word, translation)`` pairs, where the translation
	        is the IPA symbols joined by spaces. Words given on the command
	        line come first, followed by the first field of each line of the
	        ``--read`` file. The iterable is lazy: words are translated as it
	        is consumed. If a ``--check`` file is given and verification fails,
	        an empty list is returned instead.
	    """
	    ## The text is the description; passed positionally it would be taken
	    ## as the program name shown in the usage line.
	    parser = argparse.ArgumentParser(description="Translate words to ipa")

	    ##
	    ## Specifies the rules used for translation
	    ##
	    parser.add_argument("-l", "--langrules", dest="langrules",
	                        default="es.rules", type=argparse.FileType('r', encoding="utf8"),
	                        help="language code rules file")

	    ##
	    ## Specifies the log level
	    ##
	    parser.add_argument("-v", "--verbose", dest="loglevel",
	                        default=0, type=int,
	                        help="Error level specification")

	    ##
	    ## Specifies a verification file, in any csv format. Headers are
	    ## not expected. The first column is supposed to be a word, and
	    ## the second is its ideal translation
	    ##
	    parser.add_argument("-c", "--check", dest="check",
	                        default=None, type=argparse.FileType('r', encoding="utf8"),
	                        help="file to use for verification")

	    ##
	    ## Allows reading data from some file (which should not be compressed)
	    ##
	    parser.add_argument("-r", "--read", dest="read",
	                        default=None, type=argparse.FileType('r', encoding="utf8"),
	                        help="file used for translation (read up to first space)")

	    ##
	    ## Any following words would be translated
	    ##
	    parser.add_argument("words", nargs="*")

	    options = vars(parser.parse_args(argv))

	    a2ipa = alphabet2ipa(options["langrules"], loglevel=options["loglevel"])

	    if options.get("check") is not None:
	        if not a2ipa.check(options["check"]):
	            print("Verification failed, not processing additional data", file=sys.stderr)
	            return []

	    if options.get("read") is not None:
	        ## Commas count as separators too; only the first field of a line
	        ## is a word. Lines without any field (blank lines) are skipped
	        ## rather than raising IndexError.
	        splitlines = (line.replace(",", " ").split() for line in options["read"])
	        readwords = (fields[0] for fields in splitlines if fields)
	    else:
	        readwords = []

	    words = concatenate(options["words"], readwords)

	    return ((word, " ".join(a2ipa.translate(word)))
	            for word in words)



	if __name__ == "__main__":
	    ## Script entry point: one line per word, word and translation
	    ## separated by a tab.
	    for pair in main(sys.argv[1:]):
	        line = "\t".join(pair)
	        print(line)