|
|
import re
|
|
|
from collections import Counter
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
class ParsedATF:
    """Represents a parsed ATF document with methods to extract data."""

    # All tablet faces that may carry text.
    ALL_FACES = [
        "obverse",
        "reverse",
        "left",
        "right",
        "top",
        "bottom",
    ]

    def __init__(
        self, transliterations: dict, unicodes: dict, info: dict, used_signs: set
    ):
        """
        Initialize parsed ATF data.

        Args:
            transliterations: Dictionary mapping face names to newline-joined
                transliteration strings
            unicodes: Dictionary mapping face names to newline-joined
                unicode strings
            info: Metadata dictionary (e.g., language)
            used_signs: Set of cuneiform signs occurring in the document
        """
        self._transliterations = transliterations
        self._unicodes = unicodes
        self._info = info
        self._used_signs = used_signs

    def get_used_signs(self) -> set[str]:
        """Get the set of used signs."""
        return self._used_signs

    def get_transliteration(self, face: str) -> Optional[str]:
        """
        Get the transliteration for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The transliteration as a string with lines separated by newlines,
            or None if the face has no content
        """
        # Single dict lookup replaces the membership-test-then-index pattern.
        return self._transliterations.get(face)

    def get_unicode(self, face: str) -> Optional[str]:
        """
        Get the unicode representation for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The unicode representation as a string with lines separated by
            newlines, or None if the face has no content
        """
        return self._unicodes.get(face)

    def get_all_unicodes(self) -> dict[str, str]:
        """
        Get unicode for all faces that have content.

        Returns:
            Dictionary mapping '<face>_unicode' keys to unicode strings;
            faces without content are omitted, so no value is None.
        """
        # Walrus binding avoids calling get_unicode twice per face.
        return {
            f"{face}_unicode": value
            for face in self.ALL_FACES
            if (value := self.get_unicode(face)) is not None
        }

    def get_all_transliterations(self) -> dict[str, str]:
        """
        Get transliteration for all faces that have content.

        Returns:
            Dictionary mapping '<face>_transliteration' keys to
            transliteration strings; faces without content are omitted,
            so no value is None.
        """
        return {
            f"{face}_transliteration": value
            for face in self.ALL_FACES
            if (value := self.get_transliteration(face)) is not None
        }

    @property
    def info(self) -> dict:
        """Get parsing info (e.g., language)."""
        return self._info
|
|
|
|
|
|
|
|
|
class ATFConverter:
    """Converter for ATF (ASCII Transliteration Format) cuneiform text."""

    # Tablet faces recognized in "@face" structure lines.
    ALL_FACES = [
        "obverse",
        "reverse",
        "left",
        "right",
        "top",
        "bottom",
    ]

    # Non-standard face labels normalized to canonical face names.
    FACE_REMAPPING = {
        "surface a": "obverse",
        "surface b": "reverse",
    }

    # Tokens emitted verbatim into the sign stream (breaks, spaces,
    # determinatives, unreadable sign "x").
    SPECIAL_TOKENS = [
        "<B>",
        "<M>",
        "<S>",
        "<D>",
        "<munus>",
        "<ansze>",
        "<ki>",
        "<disz>",
        "x",
    ]

    def __init__(self, token_path: str = "./data/cuneiform_vocab.tsv"):
        """
        Initialize the ATF converter.

        Args:
            token_path: Path to the cuneiform vocabulary file
        """
        self.text2sign = self._load_token_mapping(token_path)

        # Diagnostic counters populated while parsing.
        self.vocab_freq = Counter()  # frequency of known tokens
        self.new_tokens = Counter()  # tokens missing from the vocabulary
        self.langs = Counter()  # languages seen in "#atf" headers
        self.unknown_faces = Counter()  # unrecognized "@..." structure keys

    def _load_token_mapping(self, token_path: str) -> dict[str, str]:
        """Load the transliteration-token to cuneiform-sign mapping.

        Args:
            token_path: Path to a TSV file with "token<TAB>sign" rows.

        Returns:
            Dictionary mapping transliteration tokens to sign strings
            (with internal spaces removed).
        """
        # NOTE: the original annotated tuple[dict, dict] but returns one dict.
        text2sign = {}
        # Context manager ensures the file handle is closed (the original
        # leaked it); iterating the handle avoids materializing all lines.
        with open(token_path) as fh:
            for t in fh:
                try:
                    k, s = t.strip("\n").split("\t")
                except ValueError:
                    # Malformed row (wrong column count): report and skip.
                    print(t)
                    continue
                text2sign[k] = s.replace(" ", "")
        return text2sign

    def _remove_at(self, x: str) -> Optional[str]:
        """Remove a trailing "@c" or "@t" qualifier from a ")"-ended token.

        Returns the cleaned token, or None if no qualifier is present.
        """
        if x.endswith(("@c)", "@t)")):
            return x[:-3] + ")"
        return None

    def _remove_spaces(self, x: list[str]) -> list[str]:
        """Collapse consecutive "<S>" (blank space) tokens into one."""
        new_x: list[str] = []
        for item in x:
            if item == "<S>" and new_x and new_x[-1] == "<S>":
                continue
            new_x.append(item)
        return new_x

    def parse(self, raw_text: str) -> Optional["ParsedATF"]:
        """
        Parse ATF text and extract transliterations and unicode.

        Args:
            raw_text: The raw ATF text to parse

        Returns:
            ParsedATF object if parsing succeeded, None if the language is
            not supported
        """
        token_text = {"default": []}
        info = {}

        curr_face = "default"
        # Some sources store literal "\n" escapes instead of real newlines.
        sep = "\\n" if "\\n" in raw_text else "\n"

        for line in raw_text.split(sep):
            line = line.strip()

            if line.startswith(("&", "'&")):
                # Document header line: nothing to extract.
                pass
            elif line.startswith("#atf"):
                info["lang"] = line.split("lang ")[-1].strip()
                self.langs[info["lang"]] += 1
                # Only Sumerian/Akkadian variants are supported.
                if info["lang"] not in ["sux", "akk", "sux, akk", "akk _sux"]:
                    return None
            elif line.startswith(("#", ">>", "<<", "||")):
                # Comment / link / variant lines: skip.
                continue
            elif line.startswith("$"):
                # State line; record breakage as a "<B>" marker.
                if "broken" in line:
                    try:
                        token_text[curr_face].append("<B>")
                    except KeyError:
                        # Narrowed from a bare except: only a missing face
                        # key can realistically fail here.
                        continue
            elif line.startswith("@"):
                key = line[1:].strip().strip("?")
                if key in self.ALL_FACES:
                    curr_face = key
                    token_text[key] = []
                elif key.startswith("column"):
                    # Column markers are resolved later in _build_outputs.
                    token_text[curr_face].append("<COL>")
                else:
                    self.unknown_faces[key] += 1
            else:
                # Numbered content line with actual transliteration.
                self._process_line_content(line, curr_face, token_text)

        transliterations, unicodes, used_signs = self._build_outputs(token_text)
        return ParsedATF(transliterations, unicodes, info, used_signs)

    def _process_line_content(self, line: str, curr_face: str, token_text: dict):
        """Process a content line and append its parsed tokens to token_text.

        Args:
            line: Raw content line (e.g. "1. an-ki ...")
            curr_face: Face currently being parsed
            token_text: Accumulator mapping face names to parsed line entries
        """
        line = self._clean_line(line)

        parts = line.split(". ")
        if len(parts) >= 2:
            if len(parts) > 2:
                # Rejoin any ". " occurrences inside the text itself.
                parts = parts[0], ". ".join(parts[1:])
            line_num, text = parts
            if curr_face != "":
                signs = self._tokens_to_signs(text.split(" "))
                signs = self._remove_spaces(signs)
                token_text[curr_face].append(
                    {"raw": text, "num": line_num, "sign": signs}
                )

    def _clean_line(self, line: str) -> str:
        """Normalize ATF markup on a content line.

        Expands determinatives, strips damage/uncertainty marks, and
        removes bracketed (restored) passages.
        """
        # Divine-name determinative gets its own special token.
        line = line.replace("{d}", "<D>")
        # Other determinatives {x} become free-standing tokens.
        for x in re.findall(r"\{.*?\}", line):
            line = line.replace(x, " " + x[1:-1] + " ")
        line = line.replace("($ blank space $)", "<S>")
        # Underscores (logogram markers) become plain separators.
        line = line.replace("_", " ")
        # Drop damage (#) and uncertainty (?, !) marks.
        line = line.replace("#", "")
        line = line.replace("?", "")
        line = line.replace("!", "")
        # Remove restored (bracketed) passages entirely.
        for x in re.findall(r"\[.*?\]", line):
            line = line.replace(x, "")
        return line

    def _tokens_to_signs(self, tokens: list[str]) -> list[str]:
        """Map transliteration tokens to cuneiform signs, updating counters."""
        signs = []
        for t in tokens:
            if "-" in t:
                # Hyphenated compound: map each syllable separately.
                for x in t.split("-"):
                    x = x.strip()
                    if len(x) == 0:
                        continue
                    if x in self.text2sign:
                        self.vocab_freq[x] += 1
                        signs.append(self.text2sign[x])
                    else:
                        new_x = self._remove_at(x)
                        if new_x and new_x in self.text2sign:
                            signs.append(self.text2sign[new_x])
                        else:
                            self.new_tokens[x] += 1
            elif t in self.text2sign:
                # BUG FIX: the original never counted plain vocabulary hits,
                # although hyphen parts and special tokens were counted.
                self.vocab_freq[t] += 1
                signs.append(self.text2sign[t])
            elif t in self.SPECIAL_TOKENS:
                self.vocab_freq[t] += 1
                signs.append(t)
            else:
                new_x = self._remove_at(t)
                if new_x and new_x in self.text2sign:
                    signs.append(self.text2sign[new_x])
                elif len(t.strip()) > 0:
                    self.new_tokens[t] += 1
        return signs

    @staticmethod
    def _join_columns(columns: list[list[str]]) -> str:
        """Join per-column line lists into one string.

        A single column is joined plainly; multiple columns are each
        prefixed with an "@column N" label.
        """
        if len(columns) == 1:
            return "\n".join(columns[0])
        return "\n".join(
            f"@column {i + 1}\n" + "\n".join(column)
            for i, column in enumerate(columns)
        )

    def _build_outputs(
        self, token_text: dict
    ) -> tuple[dict[str, str], dict[str, str], set[str]]:
        """Build transliteration and unicode outputs from parsed token_text.

        Returns:
            (transliterations, unicodes, used_signs) where the first two map
            canonical face names to newline-joined strings and used_signs is
            the set of all signs encountered. (The original annotation
            claimed list-of-list values, but joined strings are returned.)
        """
        transliterations = {}
        unicodes = {}
        used_signs = set()

        for face, lines in token_text.items():
            face_key = self.FACE_REMAPPING.get(face, face)

            face_transliterations: list[list[str]] = []
            face_unicodes: list[list[str]] = []
            current_column = {"transliteration": [], "unicode": []}

            for line in lines:
                if line == "<COL>":
                    # Column boundary: flush the current column if non-empty.
                    if current_column["transliteration"]:
                        face_transliterations.append(
                            current_column["transliteration"]
                        )
                    if current_column["unicode"]:
                        face_unicodes.append(current_column["unicode"])
                    current_column = {"transliteration": [], "unicode": []}
                    continue

                # Other bare string markers (e.g. "<B>") carry no payload.
                if isinstance(line, str):
                    continue

                used_signs.update(line.get("sign", ["<B>"]))
                current_column["transliteration"].append(line.get("raw", "<B>"))
                current_column["unicode"].append(
                    " ".join(line.get("sign", ["<B>"]))
                )

            # Flush the trailing column.
            if current_column["transliteration"]:
                face_transliterations.append(current_column["transliteration"])
            if current_column["unicode"]:
                face_unicodes.append(current_column["unicode"])

            transliterations[face_key] = self._join_columns(face_transliterations)
            unicodes[face_key] = self._join_columns(face_unicodes)

        return transliterations, unicodes, used_signs
|
|
|
|