import re
from collections import Counter
from typing import Optional


class ParsedATF:
    """Represents a parsed ATF document with methods to extract data."""

    # Face types a tablet document can expose.
    ALL_FACES = [
        "obverse",
        "reverse",
        "left",
        "right",
        "top",
        "bottom",
    ]

    def __init__(
        self, transliterations: dict, unicodes: dict, info: dict, used_signs: set
    ):
        """
        Initialize parsed ATF data.

        Args:
            transliterations: Dictionary mapping face names to transliteration
                strings (lines already joined with newlines)
            unicodes: Dictionary mapping face names to unicode strings
                (lines already joined with newlines)
            info: Metadata dictionary (e.g., language)
            used_signs: Set of sign strings seen anywhere in the document
        """
        self._transliterations = transliterations
        self._unicodes = unicodes
        self._info = info
        self._used_signs = used_signs

    def get_used_signs(self) -> set[str]:
        """Get the set of used signs."""
        return self._used_signs

    def get_transliteration(self, face: str) -> Optional[str]:
        """
        Get the transliteration for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The transliteration as a string with lines separated by newlines,
            or None if the face has no content
        """
        # Single dict lookup instead of membership test + index.
        return self._transliterations.get(face)

    def get_unicode(self, face: str) -> Optional[str]:
        """
        Get the unicode representation for a given face.

        Args:
            face: The face name (e.g., 'obverse', 'reverse')

        Returns:
            The unicode representation as a string with lines separated by
            newlines, or None if the face has no content
        """
        return self._unicodes.get(face)

    def get_all_unicodes(self) -> dict[str, str]:
        """
        Get unicode for all faces that have content.

        Returns:
            Dictionary mapping "<face>_unicode" keys to unicode strings
        """
        result = {}
        for face in self.ALL_FACES:
            # Look each value up once (the original called get_unicode twice
            # per face: once in the filter and once for the value).
            value = self.get_unicode(face)
            if value is not None:
                result[f"{face}_unicode"] = value
        return result

    def get_all_transliterations(self) -> dict[str, str]:
        """
        Get transliteration for all faces that have content.

        Returns:
            Dictionary mapping "<face>_transliteration" keys to
            transliteration strings
        """
        result = {}
        for face in self.ALL_FACES:
            value = self.get_transliteration(face)
            if value is not None:
                result[f"{face}_transliteration"] = value
        return result

    @property
    def info(self) -> dict:
        """Get parsing info (e.g., language)."""
        return self._info


class ATFConverter:
    """Converter for ATF (ASCII Transliteration Format) cuneiform text."""

    # Face types recognized in "@face" structure lines.
    ALL_FACES = [
        "obverse",
        "reverse",
        "left",
        "right",
        "top",
        "bottom",
    ]

    # Alternate face labels normalized onto canonical faces.
    FACE_REMAPPING = {
        "surface a": "obverse",
        "surface b": "reverse",
    }

    # Special tokens.
    # NOTE(review): several entries are empty strings; they look like marker
    # tokens whose original characters were lost. The empty string also
    # matches tokens produced by consecutive spaces in the input — confirm
    # against the upstream vocabulary whether this is intended.
    SPECIAL_TOKENS = [
        "",  # broken
        "",  # missing one or more token?
        "",  # blank space
        "",  # divine
        "",  # young woman, or woman
        "",
        "",
        "",
        "x",  # unknown signs
    ]

    def __init__(self, token_path: str = "./data/cuneiform_vocab.tsv"):
        """
        Initialize the ATF converter.

        Args:
            token_path: Path to the cuneiform vocabulary file
        """
        self.text2sign = self._load_token_mapping(token_path)
        # Counters for statistics
        self.vocab_freq = Counter()
        self.new_tokens = Counter()
        self.langs = Counter()
        self.unknown_faces = Counter()

    def _load_token_mapping(self, token_path: str) -> dict[str, str]:
        """
        Load the text-to-sign mapping.

        Args:
            token_path: Path to a TSV file with one "text<TAB>sign" pair
                per line

        Returns:
            Dictionary mapping transliteration text to its sign string
            (internal spaces removed)
        """
        # Fixes vs. original: the return annotation claimed tuple[dict, dict]
        # but one dict is returned; the file handle was never closed; the
        # encoding was implicit even though signs are non-ASCII; and a bare
        # except hid unrelated errors.
        text2sign = {}
        with open(token_path, encoding="utf-8") as handle:
            for t in handle:
                try:
                    k, s = t.strip("\n").split("\t")
                except ValueError:
                    # Malformed row (not exactly two tab-separated fields):
                    # report it and skip, as before.
                    print(t)
                    continue
                text2sign[k] = s.replace(" ", "")
        return text2sign

    def _remove_at(self, x: str) -> Optional[str]:
        """Strip an @c or @t suffix from a parenthesized token, e.g.
        "(x@c)" -> "(x)". Returns None when no such suffix is present."""
        if x.endswith("@c)") or x.endswith("@t)"):
            return x[:-3] + ")"
        return None

    def _remove_spaces(self, x: list[str]) -> list[str]:
        """Collapse runs of consecutive space tokens, keeping only the
        first of each run."""
        new_x = []
        for item in x:
            if item == "" and len(new_x) > 0 and new_x[-1] == "":
                continue
            new_x.append(item)
        return new_x

    def parse(self, raw_text: str) -> Optional[ParsedATF]:
        """
        Parse ATF text and extract transliterations and unicode.

        Args:
            raw_text: The raw ATF text to parse

        Returns:
            ParsedATF object if parsing succeeded, None if the language
            is not supported
        """
        token_text = {"default": []}
        info = {}
        curr_face = "default"

        # Some inputs carry literal "\n" escapes instead of real newlines.
        sep = "\n"
        if "\\n" in raw_text:
            sep = "\\n"

        for line in raw_text.split(sep):
            line = line.strip()
            if line.startswith("&") or line.startswith("'&"):
                # metadata
                pass
            elif line.startswith("#atf"):
                info["lang"] = line.split("lang ")[-1].strip()
                self.langs[info["lang"]] += 1
                if info["lang"] not in ["sux", "akk", "sux, akk", "akk _sux"]:
                    # do not process those not sux or akk
                    return None
            elif (
                line.startswith("#")
                or line.startswith(">>")
                or line.startswith("<<")
                or line.startswith("||")
            ):
                # comment/link
                continue
            elif line.startswith("$"):
                # State line; only "broken" is recorded.
                # NOTE(review): the "" marker appended here is the same value
                # used as a column break in _build_outputs, so a broken line
                # also splits columns — confirm this is intended.
                if "broken" in line:
                    try:
                        token_text[curr_face].append("")
                    except KeyError:
                        # Narrowed from a bare except; only a missing face
                        # bucket is plausible here.
                        continue
            elif line.startswith("@"):
                key = line[1:].strip().strip("?")
                if key in self.ALL_FACES:
                    curr_face = key
                    token_text[key] = []
                elif key.startswith("column"):
                    token_text[curr_face].append("")
                else:
                    self.unknown_faces[key] += 1
            else:
                # Ordinary content line: tokenize and map to signs.
                self._process_line_content(line, curr_face, token_text)

        # Build transliterations and unicodes from token_text
        transliterations, unicodes, used_signs = self._build_outputs(token_text)
        return ParsedATF(transliterations, unicodes, info, used_signs)

    def _process_line_content(self, line: str, curr_face: str, token_text: dict):
        """
        Process a content line ("<num>. <text>") and append its parsed
        representation to token_text[curr_face].

        Args:
            line: A stripped content line from the ATF source
            curr_face: Name of the face currently being filled
            token_text: Mapping of face name to its list of parsed lines
        """
        # Special symbols
        line = line.replace("{d}", "")
        for x in re.findall(r"\{.*?\}", line):
            line = line.replace(x, " " + x[1:-1] + " ")
        line = line.replace("($ blank space $)", "")
        # Remove underscore
        line = line.replace("_", " ")
        # Remove question mark, exclamation mark
        line = line.replace("?", "")
        line = line.replace("!", "")
        # Remove bracketed [] annotations entirely
        for x in re.findall(r"\[.*?\]", line):
            line = line.replace(x, "")

        line = line.split(". ")
        if len(line) >= 2:
            # Make sure only leading line number is split
            if len(line) > 2:
                line = line[0], ". ".join(line[1:])
            line_num, text = line
            # curr_face is always non-empty in practice ("default" or a face
            # name) — guard kept defensively from the original.
            if curr_face != "":
                tokens = text.split(" ")
                signs = []
                for t in tokens:
                    if "-" in t:
                        # Hyphenated compound: map each part separately.
                        ts = t.split("-")
                        for x in ts:
                            x = x.strip()
                            if len(x) == 0:
                                continue
                            if x in self.text2sign:
                                self.vocab_freq[x] += 1
                                signs.append(self.text2sign[x])
                            else:
                                new_x = self._remove_at(x)
                                if new_x and new_x in self.text2sign:
                                    signs.append(self.text2sign[new_x])
                                else:
                                    self.new_tokens[x] += 1
                    elif t in self.text2sign:
                        signs.append(self.text2sign[t])
                    elif t in self.SPECIAL_TOKENS:
                        self.vocab_freq[t] += 1
                        signs.append(t)
                    else:
                        new_x = self._remove_at(t)
                        if new_x and new_x in self.text2sign:
                            signs.append(self.text2sign[new_x])
                        else:
                            if len(t.strip()) > 0:
                                self.new_tokens[t] += 1
                signs = self._remove_spaces(signs)
                token_text[curr_face].append(
                    {"raw": text, "num": line_num, "sign": signs}
                )

    def _build_outputs(
        self, token_text: dict
    ) -> tuple[dict[str, str], dict[str, str], set[str]]:
        """
        Build transliteration and unicode outputs from parsed token_text.

        Each face's lines are grouped into columns (an empty-string entry in
        the line list acts as a column break). Faces with a single column are
        joined directly; multi-column faces get "@column N" headers.

        Returns:
            (transliterations, unicodes, used_signs) where the first two map
            face names to newline-joined strings. (The original annotation
            claimed list-of-list values; strings are what is returned.)
        """
        transliterations = {}
        unicodes = {}
        used_signs = set()

        for face, lines in token_text.items():
            face_key = self.FACE_REMAPPING.get(face, face)

            # List of columns, each column is a list of lines
            face_transliterations: list[list[str]] = []
            face_unicodes: list[list[str]] = []
            current_column = {"transliteration": [], "unicode": []}

            for line in lines:
                if line == "":
                    # Column break: flush whatever has accumulated.
                    if len(current_column["transliteration"]) > 0:
                        face_transliterations.append(
                            current_column["transliteration"]
                        )
                    if len(current_column["unicode"]) > 0:
                        face_unicodes.append(current_column["unicode"])
                    current_column = {"transliteration": [], "unicode": []}
                    continue
                # isinstance instead of type(...) == str; other string
                # markers carry no content.
                if isinstance(line, str):
                    continue
                used_signs.update(line.get("sign", [""]))
                current_column["transliteration"].append(line.get("raw", ""))
                current_column["unicode"].append(" ".join(line.get("sign", [""])))

            # Flush the trailing column.
            if len(current_column["transliteration"]) > 0:
                face_transliterations.append(current_column["transliteration"])
            if len(current_column["unicode"]) > 0:
                face_unicodes.append(current_column["unicode"])

            transliterations[face_key] = self._join_columns(face_transliterations)
            unicodes[face_key] = self._join_columns(face_unicodes)

        return transliterations, unicodes, used_signs

    @staticmethod
    def _join_columns(columns: list[list[str]]) -> str:
        """Join columns of lines into one string, adding "@column N" headers
        only when there is more than one column. (Deduplicates logic that
        the original repeated for transliterations and unicodes.)"""
        if len(columns) == 1:
            # No need for column markers as there is only one column
            return "\n".join(columns[0])
        return "\n".join(
            f"@column {i+1}\n" + "\n".join(column)
            for i, column in enumerate(columns)
        )