# NabuOCR / training / convert_atf.py
# Uploaded by boatbomber ("Upload training code", commit 819ced0, verified)
import re
from collections import Counter
from typing import Optional
class ParsedATF:
    """Represents a parsed ATF document with methods to extract data."""

    # Face types recognized when building the per-face output dictionaries.
    ALL_FACES = [
        "obverse",
        "reverse",
        "left",
        "right",
        "top",
        "bottom",
    ]

    def __init__(
        self, transliterations: dict, unicodes: dict, info: dict, used_signs: set
    ):
        """
        Initialize parsed ATF data.

        Args:
            transliterations: Dictionary mapping face names to newline-joined
                transliteration strings
            unicodes: Dictionary mapping face names to newline-joined
                unicode sign strings
            info: Metadata dictionary (e.g., language)
            used_signs: Set of all sign tokens encountered while parsing
        """
        self._transliterations = transliterations
        self._unicodes = unicodes
        self._info = info
        self._used_signs = used_signs

    def get_used_signs(self) -> set[str]:
        """Get the set of used signs."""
        return self._used_signs

    def get_transliteration(self, face: str) -> Optional[str]:
        """
        Get the transliteration for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The transliteration as a string with lines separated by newlines,
            or None if the face has no content
        """
        # .get() returns None for missing faces in a single lookup.
        return self._transliterations.get(face)

    def get_unicode(self, face: str) -> Optional[str]:
        """
        Get the unicode representation for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The unicode representation as a string with lines separated by newlines,
            or None if the face has no content
        """
        return self._unicodes.get(face)

    def get_all_unicodes(self) -> dict[str, str]:
        """
        Get unicode for all faces that have content.

        Returns:
            Dictionary mapping "<face>_unicode" keys to unicode strings;
            faces without content are omitted (so no None values appear)
        """
        return {
            f"{face}_unicode": text
            for face in self.ALL_FACES
            if (text := self.get_unicode(face)) is not None
        }

    def get_all_transliterations(self) -> dict[str, str]:
        """
        Get transliteration for all faces that have content.

        Returns:
            Dictionary mapping "<face>_transliteration" keys to transliteration
            strings; faces without content are omitted (so no None values appear)
        """
        return {
            f"{face}_transliteration": text
            for face in self.ALL_FACES
            if (text := self.get_transliteration(face)) is not None
        }

    @property
    def info(self) -> dict:
        """Get parsing info (e.g., language)."""
        return self._info
class ATFConverter:
"""Converter for ATF (ASCII Transliteration Format) cuneiform text."""
# Face types
ALL_FACES = [
"obverse",
"reverse",
"left",
"right",
"top",
"bottom",
]
FACE_REMAPPING = {
"surface a": "obverse",
"surface b": "reverse",
}
# Special tokens
SPECIAL_TOKENS = [
"<B>", # broken
"<M>", # missing one or more token?
"<S>", # blank space
"<D>", # divine
"<munus>", # young woman, or woman
"<ansze>",
"<ki>",
"<disz>",
"x", # unknown signs
]
def __init__(self, token_path: str = "./data/cuneiform_vocab.tsv"):
"""
Initialize the ATF converter.
Args:
token_path: Path to the cuneiform vocabulary file
"""
self.text2sign = self._load_token_mapping(token_path)
# Counters for statistics
self.vocab_freq = Counter()
self.new_tokens = Counter()
self.langs = Counter()
self.unknown_faces = Counter()
def _load_token_mapping(self, token_path: str) -> tuple[dict, dict]:
"""Load the text to sign and sign to text mappings."""
text2sign = {}
for t in open(token_path).readlines():
try:
k, s = t.strip("\n").split("\t")
except:
print(t)
continue
text2sign[k] = s.replace(" ", "")
return text2sign
def _remove_at(self, x: str) -> Optional[str]:
"""Remove @c or @t suffixes from tokens."""
if x.endswith("@c)") or x.endswith("@t)"):
return x[:-3] + ")"
return None
def _remove_spaces(self, x: list[str]) -> list[str]:
"""Remove consecutive space tokens."""
new_x = []
for item in x:
if item == "<S>" and len(new_x) > 0 and new_x[-1] == "<S>":
continue
new_x.append(item)
return new_x
def parse(self, raw_text: str) -> Optional[ParsedATF]:
"""
Parse ATF text and extract transliterations and unicode.
Args:
raw_text: The raw ATF text to parse
Returns:
ParsedATF object if parsing succeeded, None if the language is not supported
"""
token_text = {"default": []}
info = {}
curr_face = "default"
sep = "\n"
if "\\n" in raw_text:
sep = "\\n"
for line in raw_text.split(sep):
line = line.strip()
if line.startswith("&") or line.startswith("'&"):
# metadata
pass
elif line.startswith("#atf"):
info["lang"] = line.split("lang ")[-1].strip()
self.langs[info["lang"]] += 1
if info["lang"] not in ["sux", "akk", "sux, akk", "akk _sux"]:
# do not process those not sux or akk
return None
elif (
line.startswith("#")
or line.startswith(">>")
or line.startswith("<<")
or line.startswith("||")
):
# comment/link
continue
elif line.startswith("$"):
if "broken" in line:
try:
token_text[curr_face].append("<B>")
except:
continue
elif line.startswith("@"):
key = line[1:].strip().strip("?")
if key in self.ALL_FACES:
curr_face = key
token_text[key] = []
elif key.startswith("column"):
token_text[curr_face].append("<COL>")
else:
self.unknown_faces[key] += 1
else:
# Process line content
self._process_line_content(line, curr_face, token_text)
# Build transliterations and unicodes from token_text
transliterations, unicodes, used_signs = self._build_outputs(token_text)
return ParsedATF(transliterations, unicodes, info, used_signs)
def _process_line_content(self, line: str, curr_face: str, token_text: dict):
"""Process a content line and extract tokens."""
# Special symbols
line = line.replace("{d}", "<D>")
for x in re.findall(r"\{.*?\}", line):
line = line.replace(x, " " + x[1:-1] + " ")
line = line.replace("($ blank space $)", "<S>")
# Remove underscore
line = line.replace("_", " ")
# Remove ending hash #
line = line.replace("#", "")
# Remove question mark, exclamation mark
line = line.replace("?", "")
line = line.replace("!", "")
# Remove [] and ()
for x in re.findall(r"\[.*?\]", line):
line = line.replace(x, "")
line = line.split(". ")
if len(line) >= 2:
# Make sure only leading line number is split
if len(line) > 2:
line = line[0], ". ".join(line[1:])
line_num, text = line
if curr_face != "":
tokens = text.split(" ")
signs = []
for i, t in enumerate(tokens):
# if i > 0 and len(signs) > 0:
# signs.append("<S>") # insert a space between words
if "-" in t:
ts = t.split("-")
for x in ts:
x = x.strip()
if len(x) == 0:
continue
if x in self.text2sign:
self.vocab_freq[x] += 1
signs.append(self.text2sign[x])
else:
new_x = self._remove_at(x)
if new_x and new_x in self.text2sign:
signs.append(self.text2sign[new_x])
else:
self.new_tokens[x] += 1
elif t in self.text2sign:
signs.append(self.text2sign[t])
elif t in self.SPECIAL_TOKENS:
self.vocab_freq[t] += 1
signs.append(t)
else:
new_x = self._remove_at(t)
if new_x and new_x in self.text2sign:
signs.append(self.text2sign[new_x])
else:
if len(t.strip()) > 0:
self.new_tokens[t] += 1
signs = self._remove_spaces(signs)
token_text[curr_face].append(
{"raw": text, "num": line_num, "sign": signs}
)
def _build_outputs(
self, token_text: dict
) -> tuple[dict[str, list[list[str]]], dict[str, list[list[str]]], set[str]]:
"""Build transliterations and unicode outputs from parsed token_text."""
transliterations = {}
unicodes = {}
used_signs = set()
for face in token_text.keys():
lines = token_text[face]
face_key = self.FACE_REMAPPING.get(face, face)
# List of columns, each column is a list of lines
face_transliterations: list[list[str]] = []
face_unicodes: list[list[str]] = []
current_column = {"transliteration": [], "unicode": []}
for line in lines:
if line == "<COL>":
if len(current_column["transliteration"]) > 0:
face_transliterations.append(current_column["transliteration"])
if len(current_column["unicode"]) > 0:
face_unicodes.append(current_column["unicode"])
current_column = {"transliteration": [], "unicode": []}
continue
if type(line) == str:
continue
used_signs.update(line.get("sign", ["<B>"]))
current_column["transliteration"].append(line.get("raw", "<B>"))
current_column["unicode"].append(" ".join(line.get("sign", ["<B>"])))
if len(current_column["transliteration"]) > 0:
face_transliterations.append(current_column["transliteration"])
if len(current_column["unicode"]) > 0:
face_unicodes.append(current_column["unicode"])
if len(face_transliterations) == 1:
# No need for column markers as there is only one column
transliterations[face_key] = "\n".join(face_transliterations[0])
else:
transliterations[face_key] = "\n".join(
[
f"@column {i+1}\n" + "\n".join(column)
for i, column in enumerate(face_transliterations)
]
)
if len(face_unicodes) == 1:
# No need for column markers as there is only one column
unicodes[face_key] = "\n".join(face_unicodes[0])
else:
unicodes[face_key] = "\n".join(
[
f"@column {i+1}\n" + "\n".join(column)
for i, column in enumerate(face_unicodes)
]
)
return transliterations, unicodes, used_signs