boatbomber committed on
Commit 819ced0 · verified · 1 Parent(s): 1c275ca

Upload training code

training/convert_atf.py ADDED
@@ -0,0 +1,371 @@
+ import re
+ from collections import Counter
+ from typing import Optional
+
+
+ class ParsedATF:
+     """Represents a parsed ATF document with methods to extract data."""
+
+     # Face types
+     ALL_FACES = [
+         "obverse",
+         "reverse",
+         "left",
+         "right",
+         "top",
+         "bottom",
+     ]
+
+     def __init__(
+         self, transliterations: dict, unicodes: dict, info: dict, used_signs: set
+     ):
+         """
+         Initialize parsed ATF data.
+
+         Args:
+             transliterations: Dictionary mapping face names to transliteration strings
+             unicodes: Dictionary mapping face names to unicode strings
+             info: Metadata dictionary (e.g., language)
+             used_signs: Set of signs that appear in the document
+         """
+         self._transliterations = transliterations
+         self._unicodes = unicodes
+         self._info = info
+         self._used_signs = used_signs
+
+     def get_used_signs(self) -> set[str]:
+         """Get the set of used signs."""
+         return self._used_signs
+
+     def get_transliteration(self, face: str) -> Optional[str]:
+         """
+         Get the transliteration for a given face.
+
+         Args:
+             face: The face name (e.g., 'obverse', 'reverse')
+
+         Returns:
+             The transliteration as a string with lines separated by newlines,
+             or None if the face has no content
+         """
+         return self._transliterations.get(face)
+
+     def get_unicode(self, face: str) -> Optional[str]:
+         """
+         Get the unicode representation for a given face.
+
+         Args:
+             face: The face name (e.g., 'obverse', 'reverse')
+
+         Returns:
+             The unicode representation as a string with lines separated by newlines,
+             or None if the face has no content
+         """
+         return self._unicodes.get(face)
+
+     def get_all_unicodes(self) -> dict[str, str]:
+         """
+         Get unicode for all faces that have content.
+
+         Returns:
+             Dictionary mapping '<face>_unicode' keys to unicode strings
+         """
+         return {
+             f"{face}_unicode": self.get_unicode(face)
+             for face in self.ALL_FACES
+             if self.get_unicode(face) is not None
+         }
+
+     def get_all_transliterations(self) -> dict[str, str]:
+         """
+         Get transliteration for all faces that have content.
+
+         Returns:
+             Dictionary mapping '<face>_transliteration' keys to transliteration strings
+         """
+         return {
+             f"{face}_transliteration": self.get_transliteration(face)
+             for face in self.ALL_FACES
+             if self.get_transliteration(face) is not None
+         }
+
+     @property
+     def info(self) -> dict:
+         """Get parsing info (e.g., language)."""
+         return self._info
+
+
+ class ATFConverter:
+     """Converter for ATF (ASCII Transliteration Format) cuneiform text."""
+
+     # Face types
+     ALL_FACES = [
+         "obverse",
+         "reverse",
+         "left",
+         "right",
+         "top",
+         "bottom",
+     ]
+
+     FACE_REMAPPING = {
+         "surface a": "obverse",
+         "surface b": "reverse",
+     }
+
+     # Special tokens
+     SPECIAL_TOKENS = [
+         "<B>",  # broken
+         "<M>",  # one or more missing tokens
+         "<S>",  # blank space
+         "<D>",  # divine determinative
+         "<munus>",  # woman or young woman
+         "<ansze>",
+         "<ki>",
+         "<disz>",
+         "x",  # unknown sign
+     ]
+
+     def __init__(self, token_path: str = "./data/cuneiform_vocab.tsv"):
+         """
+         Initialize the ATF converter.
+
+         Args:
+             token_path: Path to the cuneiform vocabulary file
+         """
+         self.text2sign = self._load_token_mapping(token_path)
+
+         # Counters for statistics
+         self.vocab_freq = Counter()
+         self.new_tokens = Counter()
+         self.langs = Counter()
+         self.unknown_faces = Counter()
+
+     def _load_token_mapping(self, token_path: str) -> dict[str, str]:
+         """Load the text-to-sign mapping from the vocabulary TSV."""
+         text2sign = {}
+         with open(token_path) as f:
+             for t in f:
+                 try:
+                     k, s = t.strip("\n").split("\t")
+                 except ValueError:
+                     print(t)
+                     continue
+                 text2sign[k] = s.replace(" ", "")
+
+         return text2sign
+
+     def _remove_at(self, x: str) -> Optional[str]:
+         """Remove @c or @t suffixes from tokens."""
+         if x.endswith("@c)") or x.endswith("@t)"):
+             return x[:-3] + ")"
+         return None
+
+     def _remove_spaces(self, x: list[str]) -> list[str]:
+         """Collapse consecutive space tokens into one."""
+         new_x = []
+         for item in x:
+             if item == "<S>" and len(new_x) > 0 and new_x[-1] == "<S>":
+                 continue
+             new_x.append(item)
+         return new_x
+
+     def parse(self, raw_text: str) -> Optional[ParsedATF]:
+         """
+         Parse ATF text and extract transliterations and unicode.
+
+         Args:
+             raw_text: The raw ATF text to parse
+
+         Returns:
+             ParsedATF object if parsing succeeded, None if the language is not supported
+         """
+         token_text = {"default": []}
+         info = {}
+
+         curr_face = "default"
+         sep = "\n"
+         if "\\n" in raw_text:
+             sep = "\\n"
+
+         for line in raw_text.split(sep):
+             line = line.strip()
+
+             if line.startswith("&") or line.startswith("'&"):
+                 # metadata
+                 pass
+             elif line.startswith("#atf"):
+                 info["lang"] = line.split("lang ")[-1].strip()
+                 self.langs[info["lang"]] += 1
+                 if info["lang"] not in ["sux", "akk", "sux, akk", "akk _sux"]:
+                     # Skip languages other than Sumerian (sux) and Akkadian (akk)
+                     return None
+             elif (
+                 line.startswith("#")
+                 or line.startswith(">>")
+                 or line.startswith("<<")
+                 or line.startswith("||")
+             ):
+                 # comment/link
+                 continue
+             elif line.startswith("$"):
+                 if "broken" in line:
+                     token_text[curr_face].append("<B>")
+             elif line.startswith("@"):
+                 key = line[1:].strip().strip("?")
+                 if key in self.ALL_FACES:
+                     curr_face = key
+                     token_text[key] = []
+                 elif key.startswith("column"):
+                     token_text[curr_face].append("<COL>")
+                 else:
+                     self.unknown_faces[key] += 1
+             else:
+                 # Process line content
+                 self._process_line_content(line, curr_face, token_text)
+
+         # Build transliterations and unicodes from token_text
+         transliterations, unicodes, used_signs = self._build_outputs(token_text)
+         return ParsedATF(transliterations, unicodes, info, used_signs)
+
+     def _process_line_content(self, line: str, curr_face: str, token_text: dict):
+         """Process a content line and extract tokens."""
+         # Special symbols
+         line = line.replace("{d}", "<D>")
+
+         # Break determinatives out of their braces, e.g. "{ki}" -> " ki "
+         for x in re.findall(r"\{.*?\}", line):
+             line = line.replace(x, " " + x[1:-1] + " ")
+
+         line = line.replace("($ blank space $)", "<S>")
+
+         # Remove underscores
+         line = line.replace("_", " ")
+
+         # Remove damage hash marks
+         line = line.replace("#", "")
+
+         # Remove question marks and exclamation marks
+         line = line.replace("?", "")
+         line = line.replace("!", "")
+
+         # Remove bracketed (restored) text
+         for x in re.findall(r"\[.*?\]", line):
+             line = line.replace(x, "")
+
+         parts = line.split(". ")
+
+         if len(parts) >= 2:
+             # Make sure only the leading line number is split off
+             if len(parts) > 2:
+                 parts = parts[0], ". ".join(parts[1:])
+
+             line_num, text = parts
+             if curr_face != "":
+                 tokens = text.split(" ")
+                 signs = []
+                 for i, t in enumerate(tokens):
+                     # if i > 0 and len(signs) > 0:
+                     #     signs.append("<S>")  # insert a space between words
+
+                     if "-" in t:
+                         ts = t.split("-")
+                         for x in ts:
+                             x = x.strip()
+                             if len(x) == 0:
+                                 continue
+                             if x in self.text2sign:
+                                 self.vocab_freq[x] += 1
+                                 signs.append(self.text2sign[x])
+                             else:
+                                 new_x = self._remove_at(x)
+                                 if new_x and new_x in self.text2sign:
+                                     signs.append(self.text2sign[new_x])
+                                 else:
+                                     self.new_tokens[x] += 1
+                     elif t in self.text2sign:
+                         signs.append(self.text2sign[t])
+                     elif t in self.SPECIAL_TOKENS:
+                         self.vocab_freq[t] += 1
+                         signs.append(t)
+                     else:
+                         new_x = self._remove_at(t)
+                         if new_x and new_x in self.text2sign:
+                             signs.append(self.text2sign[new_x])
+                         elif len(t.strip()) > 0:
+                             self.new_tokens[t] += 1
+
+                 signs = self._remove_spaces(signs)
+                 token_text[curr_face].append(
+                     {"raw": text, "num": line_num, "sign": signs}
+                 )
+
+     def _build_outputs(
+         self, token_text: dict
+     ) -> tuple[dict[str, str], dict[str, str], set[str]]:
+         """Build transliteration and unicode outputs from parsed token_text."""
+         transliterations = {}
+         unicodes = {}
+         used_signs = set()
+
+         for face, lines in token_text.items():
+             face_key = self.FACE_REMAPPING.get(face, face)
+
+             # List of columns, each column is a list of lines
+             face_transliterations: list[list[str]] = []
+             face_unicodes: list[list[str]] = []
+
+             current_column = {"transliteration": [], "unicode": []}
+
+             for line in lines:
+                 if line == "<COL>":
+                     if len(current_column["transliteration"]) > 0:
+                         face_transliterations.append(current_column["transliteration"])
+                     if len(current_column["unicode"]) > 0:
+                         face_unicodes.append(current_column["unicode"])
+                     current_column = {"transliteration": [], "unicode": []}
+                     continue
+
+                 if isinstance(line, str):
+                     # Stray marker (e.g. "<B>") with no parsed line data
+                     continue
+
+                 used_signs.update(line.get("sign", ["<B>"]))
+
+                 current_column["transliteration"].append(line.get("raw", "<B>"))
+                 current_column["unicode"].append(" ".join(line.get("sign", ["<B>"])))
+
+             if len(current_column["transliteration"]) > 0:
+                 face_transliterations.append(current_column["transliteration"])
+             if len(current_column["unicode"]) > 0:
+                 face_unicodes.append(current_column["unicode"])
+
+             if len(face_transliterations) == 1:
+                 # No need for column markers as there is only one column
+                 transliterations[face_key] = "\n".join(face_transliterations[0])
+             else:
+                 transliterations[face_key] = "\n".join(
+                     [
+                         f"@column {i+1}\n" + "\n".join(column)
+                         for i, column in enumerate(face_transliterations)
+                     ]
+                 )
+
+             if len(face_unicodes) == 1:
+                 # No need for column markers as there is only one column
+                 unicodes[face_key] = "\n".join(face_unicodes[0])
+             else:
+                 unicodes[face_key] = "\n".join(
+                     [
+                         f"@column {i+1}\n" + "\n".join(column)
+                         for i, column in enumerate(face_unicodes)
+                     ]
+                 )
+
+         return transliterations, unicodes, used_signs
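
For reference, a minimal usage sketch of this converter (assuming a vocabulary TSV exists at the default `token_path`, as the constructor implies):

    from convert_atf import ATFConverter

    converter = ATFConverter()          # loads ./data/cuneiform_vocab.tsv
    parsed = converter.parse(atf_text)  # atf_text: one ATF document as a string
    if parsed is not None:              # None means the language tag was not sux/akk
        print(parsed.get_unicode("obverse"))      # sign string for one face
        print(parsed.get_all_transliterations())  # {"<face>_transliteration": ...}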
training/cuneiform_ocr_eval.ipynb ADDED
@@ -0,0 +1,254 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e4ca0fb0",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "from PIL import Image\n",
+     "from tqdm.auto import tqdm\n",
+     "from transformers import AutoModelForCausalLM, AutoProcessor"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a961375e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load dataset\n",
+     "from get_cdli_dataset import get_dataset, IMG_CACHE\n",
+     "\n",
+     "dataset = get_dataset()\n",
+     "test_dataset = dataset[\"test\"]\n",
+     "\n",
+     "print(test_dataset)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e226c45c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load the model\n",
+     "\n",
+     "# model_path = \"PaddlePaddle/PaddleOCR-VL\"  # base\n",
+     "# model_path = \"./outputs/sft\"\n",
+     "model_path = \"../\"\n",
+     "\n",
+     "model = AutoModelForCausalLM.from_pretrained(\n",
+     "    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16\n",
+     ").to(\"cuda\").eval()\n",
+     "processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "97b9a2cb",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import pyxdameraulevenshtein as dl\n",
+     "\n",
+     "\n",
+     "def compute_ter(expected_ids: list[int], predicted_ids: list[int]) -> float:\n",
+     "    \"\"\"\n",
+     "    Compute Token Error Rate (TER) between ground truth and completion tokens.\n",
+     "    TER = (substitutions + deletions + insertions) / len(ground_truth)\n",
+     "\n",
+     "    TER suits cuneiform OCR better than CER because:\n",
+     "    - Multi-character Unicode signs count as 1 token instead of multiple chars\n",
+     "    - Special tokens like @obverse/@reverse count as 1 token\n",
+     "    \"\"\"\n",
+     "\n",
+     "    if len(expected_ids) == 0:\n",
+     "        return 0.0 if len(predicted_ids) == 0 else 1.0\n",
+     "\n",
+     "    # Calculate edit distance on token sequences\n",
+     "    distance = dl.damerau_levenshtein_distance(expected_ids, predicted_ids)\n",
+     "\n",
+     "    # TER is the edit distance normalized by the truth token count\n",
+     "    ter = distance / max(1, len(expected_ids))\n",
+     "\n",
+     "    return ter"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "859c4fc2",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Run inference on all test examples\n",
+     "results = []\n",
+     "total_ter = 0.0\n",
+     "\n",
+     "pbar = tqdm(test_dataset, desc=\"Evaluating on test set\")\n",
+     "\n",
+     "for idx, example in enumerate(pbar):\n",
+     "    expected = example[\"unicode\"]\n",
+     "    expected_ids = processor.tokenizer.encode(expected, add_special_tokens=False)\n",
+     "\n",
+     "    # Load image\n",
+     "    with Image.open(IMG_CACHE / f\"P{str(example['id']).rjust(6, '0')}.jpg\").convert(\n",
+     "        \"RGB\"\n",
+     "    ) as image:\n",
+     "        # Prepare input\n",
+     "        messages = [\n",
+     "            {\n",
+     "                \"role\": \"user\",\n",
+     "                \"content\": [\n",
+     "                    {\"type\": \"image\", \"image\": image},\n",
+     "                    {\"type\": \"text\", \"text\": \"OCR:\"},\n",
+     "                ],\n",
+     "            },\n",
+     "        ]\n",
+     "\n",
+     "        inputs = processor.apply_chat_template(\n",
+     "            messages,\n",
+     "            tokenize=True,\n",
+     "            add_generation_prompt=True,\n",
+     "            return_dict=True,\n",
+     "            return_tensors=\"pt\",\n",
+     "        ).to(\"cuda\")\n",
+     "\n",
+     "        # Generate prediction\n",
+     "        with torch.no_grad():\n",
+     "            output_ids = model.generate(\n",
+     "                **inputs,\n",
+     "                use_cache=True,\n",
+     "                max_new_tokens=int(len(expected_ids) * 1.2),\n",
+     "                repetition_penalty=1.03,\n",
+     "            )\n",
+     "\n",
+     "    # Strip the prompt tokens and the trailing EOS token\n",
+     "    predicted_ids = output_ids[0][inputs[\"input_ids\"].shape[1] :][:-1].tolist()\n",
+     "\n",
+     "    # Compute TER for this example\n",
+     "    ter = compute_ter(expected_ids, predicted_ids)\n",
+     "    total_ter += ter\n",
+     "\n",
+     "    pbar.set_postfix_str(f\"AVG TER={total_ter / (idx+1):.3f}\")\n",
+     "\n",
+     "    prediction = processor.decode(\n",
+     "        predicted_ids,\n",
+     "        skip_special_tokens=False,\n",
+     "    ).strip()\n",
+     "\n",
+     "    # Store results\n",
+     "    results.append(\n",
+     "        {\n",
+     "            \"id\": example[\"id\"],\n",
+     "            \"expected\": expected,\n",
+     "            \"prediction\": prediction,\n",
+     "            \"ter\": ter,\n",
+     "        }\n",
+     "    )\n",
+     "    tqdm.write(f\"\\033[94m\\nID: {example['id']} | TER: {ter:.4f}\\033[0m\")\n",
+     "    tqdm.write(f\"\\033[92mExpected:\\033[0m\\n{expected}\")\n",
+     "    tqdm.write(f\"\\033[91mPredicted:\\033[0m\\n{prediction}\")\n",
+     "\n",
+     "# Compute averages\n",
+     "average_ter = total_ter / len(test_dataset)\n",
+     "print(f\"\\n{'='*60}\")\n",
+     "print(f\"Average Token Error Rate (TER): {average_ter:.4f} ({average_ter*100:.2f}%)\")\n",
+     "print(f\"{'='*60}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "3c6a8e02",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Show examples: best and worst predictions (sorted by TER)\n",
+     "sorted_results = sorted(results, key=lambda x: x[\"ter\"])\n",
+     "\n",
+     "print(\"=\"*60)\n",
+     "print(\"BEST PREDICTIONS (Lowest TER)\")\n",
+     "print(\"=\"*60)\n",
+     "for i in range(min(10, len(sorted_results))):\n",
+     "    r = sorted_results[i]\n",
+     "    print(f\"\\nExample {i+1} - ID: {r['id']} - TER: {r['ter']:.4f}\")\n",
+     "    print(f\"Expected:\\n{r['expected']}\")\n",
+     "    print(f\"Predicted:\\n{r['prediction']}\")\n",
+     "    print(\"-\"*60)\n",
+     "\n",
+     "print(\"\\n\" + \"=\"*60)\n",
+     "print(\"WORST PREDICTIONS (Highest TER)\")\n",
+     "print(\"=\"*60)\n",
+     "for i in range(min(10, len(sorted_results))):\n",
+     "    r = sorted_results[-(i+1)]\n",
+     "    print(f\"\\nExample {i+1} - ID: {r['id']} - TER: {r['ter']:.4f}\")\n",
+     "    print(f\"Expected:\\n{r['expected']}\")\n",
+     "    print(f\"Predicted:\\n{r['prediction']}\")\n",
+     "    print(\"-\"*60)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d5ceae30",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# TER distribution statistics\n",
+     "import numpy as np\n",
+     "\n",
+     "ter_values = [r[\"ter\"] for r in results]\n",
+     "\n",
+     "print(\"=\"*60)\n",
+     "print(\"TER (TOKEN ERROR RATE) DISTRIBUTION STATISTICS\")\n",
+     "print(\"=\"*60)\n",
+     "print(f\"Mean TER: {np.mean(ter_values):.4f} ({np.mean(ter_values)*100:.2f}%)\")\n",
+     "print(f\"Median TER: {np.median(ter_values):.4f} ({np.median(ter_values)*100:.2f}%)\")\n",
+     "print(f\"Std Dev: {np.std(ter_values):.4f}\")\n",
+     "print(f\"Min TER: {np.min(ter_values):.4f} ({np.min(ter_values)*100:.2f}%)\")\n",
+     "print(f\"Max TER: {np.max(ter_values):.4f} ({np.max(ter_values)*100:.2f}%)\")\n",
+     "print(\"\\nPercentiles:\")\n",
+     "print(f\"  25th: {np.percentile(ter_values, 25):.4f}\")\n",
+     "print(f\"  50th: {np.percentile(ter_values, 50):.4f}\")\n",
+     "print(f\"  75th: {np.percentile(ter_values, 75):.4f}\")\n",
+     "print(f\"  90th: {np.percentile(ter_values, 90):.4f}\")\n",
+     "print(f\"  95th: {np.percentile(ter_values, 95):.4f}\")\n",
+     "print(f\"  98th: {np.percentile(ter_values, 98):.4f}\")\n",
+     "\n",
+     "# Count perfect predictions\n",
+     "perfect_predictions = sum(1 for ter in ter_values if ter == 0.0)\n",
+     "print(f\"\\nPerfect predictions (TER=0%): {perfect_predictions}/{len(ter_values)} ({perfect_predictions/len(ter_values)*100:.2f}%)\")\n",
+     "\n",
+     "# Count predictions with TER < 0.5 (less than 50% error)\n",
+     "good_predictions = sum(1 for ter in ter_values if ter < 0.5)\n",
+     "print(f\"Good predictions (TER<50%): {good_predictions}/{len(ter_values)} ({good_predictions/len(ter_values)*100:.2f}%)\")"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.13.6"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
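
As a quick sanity check of the TER definition in this notebook, a worked example (illustrative token ids, assuming the `compute_ter` cell has been run):

    # expected [7, 8, 9, 10] vs predicted [7, 9, 10, 11]:
    # delete 8, insert 11 -> Damerau-Levenshtein distance 2
    # TER = 2 / len(expected) = 2 / 4 = 0.5
    assert compute_ter([7, 8, 9, 10], [7, 9, 10, 11]) == 0.5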
training/cuneiform_ocr_grpo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
training/cuneiform_ocr_sft.ipynb ADDED
@@ -0,0 +1,397 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "fd2siqgrq6w",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# CRITICAL: This patch MUST run BEFORE importing unsloth!\n",
+     "# Fix Unsloth's gradient checkpointing for models with keyword-only forward arguments\n",
+     "\n",
+     "import sys\n",
+     "import torch\n",
+     "import os\n",
+     "\n",
+     "# Import unsloth_zoo.peft_utils first so it's in sys.modules\n",
+     "os.environ[\"UNSLOTH_IS_PRESENT\"] = \"1\"\n",
+     "import unsloth_zoo.peft_utils\n",
+     "\n",
+     "# Now patch the function before anything else imports it\n",
+     "def patched_requires_grad_pre_hook(module, input):\n",
+     "    \"\"\"Patched hook that handles empty input tuples gracefully\"\"\"\n",
+     "    type_input = type(input)\n",
+     "    if type_input is torch.Tensor:\n",
+     "        input.requires_grad_(True)\n",
+     "    elif type_input is tuple or type_input is list:\n",
+     "        if len(input) == 0:\n",
+     "            # Empty tuple = keyword-only args. This is fine, gradients flow through kwargs\n",
+     "            return\n",
+     "        if len(input) > 0 and torch.is_floating_point(input[0]):\n",
+     "            input[0].requires_grad_(True)\n",
+     "\n",
+     "# Keep a reference to the original function\n",
+     "original_func = sys.modules['unsloth_zoo.peft_utils'].requires_grad_for_gradient_checkpointing\n",
+     "\n",
+     "# Create a wrapper that mirrors the original logic but uses our patched hook\n",
+     "def patched_requires_grad_for_gradient_checkpointing(model):\n",
+     "    \"\"\"Wrapper that replicates the original logic but registers the patched pre-hook\"\"\"\n",
+     "    import re\n",
+     "    import inspect\n",
+     "\n",
+     "    # Define the other helper functions we need\n",
+     "    def requires_grad_post_hook(module, input, output):\n",
+     "        try:\n",
+     "            if hasattr(output, \"loss\") and output.loss is not None:\n",
+     "                output.loss.requires_grad_(True)\n",
+     "            elif hasattr(output, \"logits\") and output.logits is not None:\n",
+     "                output.logits.requires_grad_(True)\n",
+     "            elif type(output) is torch.Tensor:\n",
+     "                output.requires_grad_(True)\n",
+     "        except Exception:\n",
+     "            pass\n",
+     "\n",
+     "    def register_other_hooks(hook_name, hook_func_name, module, hooks_dict_name):\n",
+     "        # Remove any previously registered hook of the same name before re-registering\n",
+     "        if not hasattr(module, hooks_dict_name):\n",
+     "            return\n",
+     "        hooks_dict = getattr(module, hooks_dict_name)\n",
+     "        for hook_id, hook_fn in list(hooks_dict.items()):\n",
+     "            if hook_func_name in str(hook_fn):\n",
+     "                del hooks_dict[hook_id]\n",
+     "\n",
+     "    # Find the first parameter with requires_grad\n",
+     "    param = None\n",
+     "    for name, param in model.named_parameters():\n",
+     "        if param.requires_grad:\n",
+     "            break\n",
+     "    if param is None or not param.requires_grad:\n",
+     "        return\n",
+     "\n",
+     "    name = re.sub(r\"\\.([\\d]{1,})\\.\", r\"[\\1].\", name)\n",
+     "    name_components = name.split(\".\")\n",
+     "    if len(name_components) == 0:\n",
+     "        raise RuntimeError(\"Unsloth: Model has 0 layers?\")\n",
+     "\n",
+     "    # Find the module to hook\n",
+     "    final_where = None\n",
+     "    for j in range(len(name_components) - 1, 0, -1):\n",
+     "        name_curr = name_components[j]\n",
+     "        name_pre = \"model.\" + \".\".join(name_components[:j])\n",
+     "        if re.search(r\"\\[[\\d]{1,}\\]\", name_pre):\n",
+     "            continue\n",
+     "        module = eval(name_pre)\n",
+     "        if hasattr(module, \"forward\"):\n",
+     "            try:\n",
+     "                forward = inspect.getsource(module.forward)\n",
+     "            except Exception:\n",
+     "                continue\n",
+     "            if f\"self.{name_curr}(\" in forward:\n",
+     "                final_where = j + 1\n",
+     "                break\n",
+     "            module_list = re.sub(r\"\\[[\\d]{1,}\\]\", \"\", name_curr)\n",
+     "            if f\"in self.{module_list}:\" in forward:\n",
+     "                final_where = j\n",
+     "                break\n",
+     "            elif re.search(r\"for [^\\s]{3,} in self\\.\" + module_list, forward):\n",
+     "                final_where = j\n",
+     "                break\n",
+     "\n",
+     "    if final_where is None:\n",
+     "        # Fall back to hooking the innermost module that exposes embeddings\n",
+     "        for module_name, module in model.named_modules():\n",
+     "            if not hasattr(module, \"get_input_embeddings\"):\n",
+     "                break\n",
+     "        register_other_hooks(\"requires_grad_post_hook\", \"requires_grad_post_hook\", module, \"_forward_hooks\")\n",
+     "        module.register_forward_hook(requires_grad_post_hook)\n",
+     "        return\n",
+     "\n",
+     "    module_name = \"model.\" + \".\".join(name_components[:final_where])\n",
+     "    module = eval(module_name)\n",
+     "\n",
+     "    if hasattr(module, \"config\") and module.config.__class__.__name__ in (\"CLIPVisionConfig\", \"SiglipVisionConfig\"):\n",
+     "        old_module = model\n",
+     "        for module_name, module in model.named_modules():\n",
+     "            if not hasattr(module, \"get_input_embeddings\"):\n",
+     "                break\n",
+     "            old_module = module\n",
+     "        module = old_module\n",
+     "\n",
+     "    print(f\"Unsloth: Making `{module_name}` require gradients\")\n",
+     "\n",
+     "    # Try the post-hook first\n",
+     "    if hasattr(module, \"get_input_embeddings\"):\n",
+     "        try:\n",
+     "            module = module.get_input_embeddings()\n",
+     "            register_other_hooks(\"requires_grad_post_hook\", \"requires_grad_post_hook\", module, \"_forward_hooks\")\n",
+     "            module.register_forward_hook(requires_grad_post_hook)\n",
+     "            return\n",
+     "        except Exception:\n",
+     "            pass\n",
+     "\n",
+     "    # Use our patched pre-hook\n",
+     "    register_other_hooks(\"requires_grad_pre_hook\", \"requires_grad_pre_hook\", module, \"_forward_pre_hooks\")\n",
+     "    module.register_forward_pre_hook(patched_requires_grad_pre_hook)\n",
+     "\n",
+     "# Replace in sys.modules so unsloth picks up the patched version on import\n",
+     "sys.modules['unsloth_zoo.peft_utils'].requires_grad_for_gradient_checkpointing = patched_requires_grad_for_gradient_checkpointing\n",
+     "\n",
+     "print(\"✓ Patched Unsloth gradient checkpointing BEFORE imports\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c2c30bc6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from unsloth import FastVisionModel\n",
+     "from unsloth.trainer import UnslothVisionDataCollator"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "4326b62e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "from PIL import Image\n",
+     "from transformers import AutoModel, AutoProcessor\n",
+     "from trl import SFTTrainer, SFTConfig"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d5e899ca",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from get_cdli_dataset import atf_converter, get_dataset, IMG_CACHE\n",
+     "\n",
+     "# Load dataset\n",
+     "dataset = get_dataset()\n",
+     "\n",
+     "train_dataset = dataset[\"train\"]\n",
+     "test_dataset = dataset[\"test\"]\n",
+     "\n",
+     "print(train_dataset, test_dataset)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "9e0aa56b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Load processor and model\n",
+     "model, tokenizer = FastVisionModel.from_pretrained(\n",
+     "    \"PaddlePaddle/PaddleOCR-VL\",\n",
+     "    cache_dir=\"./hf_cache/models\",\n",
+     "    trust_remote_code=True,\n",
+     "    load_in_4bit=False,\n",
+     "    auto_model=AutoModel,\n",
+     "    full_finetuning=True,\n",
+     "    unsloth_force_compile=True,\n",
+     "    use_gradient_checkpointing=\"unsloth\",\n",
+     "    max_seq_length=16000,\n",
+     ")\n",
+     "\n",
+     "processor = AutoProcessor.from_pretrained(\n",
+     "    \"PaddlePaddle/PaddleOCR-VL\",\n",
+     "    cache_dir=\"./hf_cache/models\",\n",
+     "    trust_remote_code=True,\n",
+     ")\n",
+     "\n",
+     "processor.tokenizer = tokenizer\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "28656983",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "used_signs = set()\n",
+     "for example in train_dataset:\n",
+     "    parsed = atf_converter.parse(example[\"atf\"])\n",
+     "    used_signs.update(parsed.get_used_signs())\n",
+     "for example in test_dataset:\n",
+     "    parsed = atf_converter.parse(example[\"atf\"])\n",
+     "    used_signs.update(parsed.get_used_signs())\n",
+     "\n",
+     "print(f\"Base model vocab size: {len(processor.tokenizer)}\")\n",
+     "\n",
+     "# Add the cuneiform signs to the model vocab\n",
+     "num_added_tokens = processor.tokenizer.add_tokens(list(used_signs))\n",
+     "num_added_special_tokens = processor.tokenizer.add_special_tokens(\n",
+     "    {\n",
+     "        \"additional_special_tokens\": [f\"@{face}\" for face in atf_converter.ALL_FACES]\n",
+     "        + atf_converter.SPECIAL_TOKENS\n",
+     "    },\n",
+     "    replace_additional_special_tokens=False,\n",
+     ")\n",
+     "\n",
+     "print(\n",
+     "    f\"Added {num_added_tokens} tokens and {num_added_special_tokens} special tokens to tokenizer\"\n",
+     ")\n",
+     "\n",
+     "# Grow the embedding matrix; new rows are initialized from the mean of existing embeddings\n",
+     "model.resize_token_embeddings(len(processor.tokenizer))\n",
+     "\n",
+     "print(f\"New model vocab size: {len(processor.tokenizer)}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "5100b97c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Configure training\n",
+     "sft_training_args = SFTConfig(\n",
+     "    output_dir=\"./outputs/sft\",\n",
+     "    # max_steps=50,  # Remove for full run\n",
+     "    num_train_epochs=2,\n",
+     "    per_device_train_batch_size=2,\n",
+     "    per_device_eval_batch_size=2,\n",
+     "    gradient_accumulation_steps=1,\n",
+     "    learning_rate=2e-5,\n",
+     "    optim=\"adamw_8bit\",\n",
+     "    warmup_ratio=0.05,\n",
+     "    weight_decay=0.001,\n",
+     "    lr_scheduler_type=\"linear\",\n",
+     "    bf16=True,\n",
+     "    save_strategy=\"steps\",\n",
+     "    save_steps=200,\n",
+     "    eval_strategy=\"steps\",\n",
+     "    eval_steps=1000,\n",
+     "    logging_steps=1,\n",
+     "    report_to=\"none\",\n",
+     "    dataloader_num_workers=0,\n",
+     "    # You MUST set the items below for vision finetuning:\n",
+     "    remove_unused_columns=False,\n",
+     "    dataset_text_field=\"\",\n",
+     "    dataset_kwargs={\"skip_prepare_dataset\": True},\n",
+     "    max_length=16000,\n",
+     ")\n",
+     "\n",
+     "# Initialize trainer\n",
+     "sft_trainer = SFTTrainer(\n",
+     "    model=model,\n",
+     "    processing_class=processor,\n",
+     "    data_collator=UnslothVisionDataCollator(\n",
+     "        model,\n",
+     "        processor,\n",
+     "        train_on_responses_only=False,  # Fixed: was masking all tokens with True\n",
+     "        instruction_part=\"User: \",\n",
+     "        response_part=\"Assistant: \",\n",
+     "        pad_to_multiple_of=2,\n",
+     "        resize_dimension=\"max\",\n",
+     "        formatting_func=lambda example: {\n",
+     "            \"images\": [\n",
+     "                Image.open(IMG_CACHE / f\"P{str(example['id']).rjust(6, '0')}.jpg\")\n",
+     "            ],\n",
+     "            \"messages\": [\n",
+     "                # User message with the image and the task prompt\n",
+     "                {\n",
+     "                    \"role\": \"user\",\n",
+     "                    \"content\": [\n",
+     "                        {\n",
+     "                            \"type\": \"image\",\n",
+     "                            \"image\": Image.open(\n",
+     "                                IMG_CACHE / f\"P{str(example['id']).rjust(6, '0')}.jpg\"\n",
+     "                            ),\n",
+     "                        },\n",
+     "                        {\"type\": \"text\", \"text\": \"OCR:\"},\n",
+     "                    ],\n",
+     "                },\n",
+     "                # Assistant message with the completion text\n",
+     "                {\n",
+     "                    \"role\": \"assistant\",\n",
+     "                    \"content\": [{\"type\": \"text\", \"text\": example[\"unicode\"]}],\n",
+     "                },\n",
+     "            ],\n",
+     "        },\n",
+     "    ),\n",
+     "    args=sft_training_args,\n",
+     "    train_dataset=train_dataset,\n",
+     "    eval_dataset=test_dataset,\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "97e8455e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "gpu_stats = torch.cuda.get_device_properties(0)\n",
+     "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+     "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
+     "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
+     "print(f\"{start_gpu_memory} GB of memory reserved.\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "e6103bbe",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "sft_trainer_stats = sft_trainer.train(resume_from_checkpoint=False)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "36dc79b9",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+     "used_memory_for_training = round(used_memory - start_gpu_memory, 3)\n",
+     "used_percentage = round(used_memory / max_memory * 100, 3)\n",
+     "training_percentage = round(used_memory_for_training / max_memory * 100, 3)\n",
+     "print(f\"{sft_trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
+     "print(\n",
+     "    f\"{round(sft_trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\"\n",
+     ")\n",
+     "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
+     "print(f\"Peak reserved memory for training = {used_memory_for_training} GB.\")\n",
+     "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
+     "print(f\"Peak reserved memory for training % of max memory = {training_percentage} %.\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "50bdf718",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Save model (the processor/tokenizer is saved alongside it)\n",
+     "processor.save_pretrained(sft_training_args.output_dir)\n",
+     "model.save_pretrained(sft_training_args.output_dir)\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": ".venv",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.13.6"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
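
The vocab-extension cell above relies on `resize_token_embeddings` to initialize the new rows; recent transformers versions default to mean-based initialization. On a version without that behavior, the mean init can be done explicitly; a minimal sketch (helper name hypothetical, and an untied output head would need the same treatment):

    import torch

    def mean_init_new_embeddings(model, num_new: int):
        # Hypothetical helper: set the last num_new rows of the input
        # embedding matrix to the mean of the pre-existing rows.
        with torch.no_grad():
            emb = model.get_input_embeddings().weight
            emb[-num_new:] = emb[:-num_new].mean(dim=0, keepdim=True)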
training/get_cdli_dataset.py ADDED
@@ -0,0 +1,279 @@
+ import concurrent.futures
+ import math
+ import time
+ from io import BytesIO
+ from pathlib import Path
+
+ import requests
+ from convert_atf import ATFConverter
+ from datasets import Dataset
+ from PIL import Image
+ from tqdm.auto import tqdm
+
+ atf_converter = ATFConverter()
+
+ IMG_CACHE = Path("./data/cdli_images")
+ IMG_CACHE.mkdir(exist_ok=True, parents=True)
+ MAX_IMG_RES = 2048
+ DOWNLOAD_MODE = False
+
+
+ def smart_resize(
+     height: int,
+     width: int,
+     factor: int = 28,
+     min_pixels: int = 28 * 28 * 130,
+     max_pixels: int = 28 * 28 * 1280,
+ ):
+     """Rescales the image so that the following conditions are met:
+
+     1. Both dimensions (height and width) are divisible by 'factor'.
+     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+     3. The aspect ratio of the image is maintained as closely as possible.
+     """
+     # Unlike the upstream implementation, tiny images are upscaled to `factor`
+     # instead of raising:
+     # if height < factor or width < factor:
+     #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+
+     if height < factor:
+         print(f"smart_resize: height={height} < factor={factor}, reset height=factor")
+         width = round((width * factor) / height)
+         height = factor
+
+     if width < factor:
+         print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
+         height = round((height * factor) / width)
+         width = factor
+
+     if max(height, width) / min(height, width) > 200:
+         raise ValueError(
+             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+         )
+     h_bar = round(height / factor) * factor
+     w_bar = round(width / factor) * factor
+     if h_bar * w_bar > max_pixels:
+         beta = math.sqrt((height * width) / max_pixels)
+         h_bar = math.floor(height / beta / factor) * factor
+         w_bar = math.floor(width / beta / factor) * factor
+     elif h_bar * w_bar < min_pixels:
+         beta = math.sqrt(min_pixels / (height * width))
+         h_bar = math.ceil(height * beta / factor) * factor
+         w_bar = math.ceil(width * beta / factor) * factor
+     return h_bar, w_bar
+
+
+ def resize_image(img_path):
+     with Image.open(img_path).convert("RGB") as image:
+         width, height = image.size
+         # Scale down if larger than MAX_IMG_RES
+         if width > MAX_IMG_RES or height > MAX_IMG_RES:
+             scale = MAX_IMG_RES / max(width, height)
+             height = int(height * scale)
+             width = int(width * scale)
+         # Always ensure dimensions are multiples of 28 for vision model compatibility
+         new_height, new_width = smart_resize(height, width)
+         if new_height != image.height or new_width != image.width:
+             image = image.resize((new_width, new_height), Image.LANCZOS)
+             image.save(img_path)
+
+
+ def resize_cached_images():
+     img_paths = list(IMG_CACHE.glob("*.jpg"))
+     pbar = tqdm(total=len(img_paths), desc="Resizing cached images")
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+         futures = [executor.submit(resize_image, img_path) for img_path in img_paths]
+         for future in concurrent.futures.as_completed(futures):
+             pbar.update(1)
+
+     pbar.close()
+
+
+ def get_image(photo_id: int):
+     file_name = f"P{str(photo_id).rjust(6, '0')}.jpg"
+     url = f"https://cdli.earth/dl/photo/{file_name}"
+     cache_file = IMG_CACHE / file_name
+
+     try:
+         if cache_file.exists():
+             tqdm.write(f"Found {file_name} in cache")
+             image = Image.open(cache_file).convert("RGB")
+         else:
+             response = requests.get(url, timeout=5)
+             response.raise_for_status()
+             image = Image.open(BytesIO(response.content)).convert("RGB")
+
+             tqdm.write(f"Downloaded {file_name}")
+
+             width, height = image.size
+             # Scale down if larger than MAX_IMG_RES
+             if width > MAX_IMG_RES or height > MAX_IMG_RES:
+                 scale = MAX_IMG_RES / max(width, height)
+                 height = int(height * scale)
+                 width = int(width * scale)
+             # Always ensure dimensions are multiples of 28 for vision model compatibility
+             new_height, new_width = smart_resize(height, width)
+             if new_height != image.height or new_width != image.width:
+                 image = image.resize((new_width, new_height), Image.LANCZOS)
+
+             image.save(cache_file)
+             time.sleep(0.02)  # Rate limiting
+     except requests.exceptions.Timeout:
+         tqdm.write(f"Timeout downloading {file_name}")
+         return None
+     except requests.exceptions.RequestException as e:
+         tqdm.write(f"Error downloading {file_name}: {e}")
+         return None
+     except Exception as e:
+         tqdm.write(f"Error processing {file_name}: {type(e).__name__}: {e}")
+         return None
+
+     return image
+
+
+ def count_repetitions(text: str) -> int:
+     """
+     Count the total number of repeated character occurrences in a string.
+     E.g., "122233" has 3 repetitions (2 appears 2 extra times, 3 appears 1 extra time).
+     """
+     if len(text) < 2:
+         return 0
+
+     return len(text) - len(set(text))
+
+
+ def get_dataset(file="./data/cdli_dataset.parquet"):
+     if Path(file).exists():
+         return Dataset.from_parquet(file).train_test_split(test_size=1000, seed=42)
+
+     # 1. Get all the ids from cdli.atf (source: https://github.com/cdli-gh/data/raw/refs/heads/master/cdliatf_unblocked.atf)
+     cdli_raw = Path("./data/cdli.atf").read_text(encoding="utf-8").split("&P")
+     cdli_filtered = [
+         section.strip()
+         for section in cdli_raw
+         if section.strip()  # Ignore empty sections
+         and "@tablet" in section  # Only include tablets
+         and len(section) > 50  # Ignore short sections
+         and len(section) < 1000  # Ignore long sections
+         and any(lang in section for lang in ["sux", "akk"])  # Limit supported languages
+     ]
+
+     ids = []
+     atfs = []
+     unicodes = []
+
+     for section in tqdm(cdli_filtered, desc="Parsing CDLI dump"):
+         # Take the ID from the first line (everything before the '='); skip if not numeric
+         lines = section.splitlines()
+         id_part = lines[0].split("=")[0].strip()
+         if not id_part.isdigit():
+             continue
+
+         atf = "\n".join(
+             [
+                 line
+                 for line in lines[1:]
+                 if not (
+                     line.startswith("# ")
+                     or line.startswith(">>")
+                     or line.startswith("<<")
+                     or line.startswith("||")
+                 )
+             ]
+         )
+         parsed = atf_converter.parse(atf)
+         if parsed is None:
+             tqdm.write(f"=====\033[91m {id_part} skip (parse fail) \033[0m=====")
+             continue
+
+         unicode_parts = [
+             f"@{face}\n{parsed.get_unicode(face)}"
+             for face in parsed.ALL_FACES
+             if parsed.get_unicode(face)
+         ]
+
+         # Skip tablets that are too long or too short
+         unicode_len = sum(len(part) for part in unicode_parts)
+         if unicode_len > 300 or unicode_len < 20:
+             tqdm.write(f"=====\033[91m {id_part} skip (too short/long) \033[0m=====")
+             continue
+         # Skip tablets that are poorly converted to unicode
+         if sum(part.count("x") for part in unicode_parts) >= 2:
+             tqdm.write(f"=====\033[91m {id_part} skip (missing symbols) \033[0m=====")
+             continue
+
+         unicode = "\n".join(unicode_parts)
+
+         # Drop the super repetitive admin tablets (the model gets stuck repeating the common phrases)
+         if count_repetitions(unicode) / len(unicode) > 0.7:
+             tqdm.write(f"=====\033[91m {id_part} skip (too repetitive) \033[0m=====")
+             continue
+
+         # Skip if we don't have an image for this ATF
+         if DOWNLOAD_MODE:
+             image = get_image(int(id_part))
+         elif (IMG_CACHE / f"P{str(int(id_part)).rjust(6, '0')}.jpg").exists():
+             image = Image.open(
+                 IMG_CACHE / f"P{str(int(id_part)).rjust(6, '0')}.jpg"
+             ).convert("RGB")
+         else:
+             tqdm.write(f"=====\033[91m {id_part} skip (no img) \033[0m=====")
+             continue
+
+         if not image:
+             tqdm.write(f"=====\033[91m {id_part} skip (no img) \033[0m=====")
+             continue
+
+         # Drop low-res, black-and-white, or non-isolated-background images
+         try:
+             if min(image.size) < 100:
+                 tqdm.write(f"=====\033[91m {id_part} skip (lowres) \033[0m=====")
+                 continue
+
+             scale = 150 / image.height
+             small_image = image.resize(
+                 (int(image.width * scale), int(image.height * scale)), Image.LANCZOS
+             )
+             pixels = list(small_image.getdata())
+             small_image.close()
+             image.close()
+
+             bw_pixels = sum(1 for r, g, b in pixels if r == g == b)
+             bw_percent = bw_pixels / len(pixels)
+             if bw_percent > 0.95 or bw_percent < 0.1:
+                 tqdm.write(
+                     f"=====\033[91m {id_part} skip (bw {bw_percent*100:.1f}%) \033[0m====="
+                 )
+                 continue
+
+             if sum(1 for r, g, b in pixels if r == g == b == 0) / len(pixels) < 0.15:
+                 tqdm.write(
+                     f"=====\033[91m {id_part} skip (not on black background) \033[0m====="
+                 )
+                 continue
+         except Exception as e:
+             tqdm.write(
+                 f"=====\033[91m {id_part} skip (err img check: {e}) \033[0m====="
+             )
+             continue
+
+         ids.append(int(id_part))
+         atfs.append(atf)
+         unicodes.append(unicode)
+
+         tqdm.write(f"=====\033[32m {id_part} unicode (len {unicode_len}) \033[0m=====")
+
+     dataset = Dataset.from_dict(
+         {
+             "id": ids,
+             "atf": atfs,
+             "unicode": unicodes,
+         }
+     )
+
+     dataset.to_parquet(file)
+     return dataset.train_test_split(test_size=1000, seed=42)
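
To illustrate the constraints in `smart_resize`, a worked example (values computed by hand from the formulas above):

    # A 1000x700 photo: each dimension is rounded to the nearest multiple of 28.
    # round(1000 / 28) * 28 = 1008; round(700 / 28) * 28 = 700.
    # 1008 * 700 = 705,600 pixels, inside [28*28*130, 28*28*1280] = [101,920, 1,003,520],
    # so no further area scaling is applied.
    print(smart_resize(1000, 700))  # -> (1008, 700)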