File size: 54,859 Bytes

e82ea71

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# GLM-OCR to CoreML Conversion\n",
        "\n",
        "Converts [GLM-OCR](https://huggingface.co/aoiandroid/GLM-OCR) to CoreML for iOS/macOS.\n",
        "\n",
        "**Model**: CogViT visual encoder + cross-modal connector + GLM-0.5B decoder.  \n",
        "**Outputs**:\n",
        "- `vision_encoder.mlpackage` - CogViT encoder (always exported)\n",
        "- `decoder.mlpackage` - GLM-0.5B single-step decoder (if model supports it)\n",
        "- `model_spec.json` - I/O shapes for Swift integration\n",
        "\n",
        "**Requirements**: Python 3.10+, PyTorch 2.4+ (transformers main disables its PyTorch backend on older versions), transformers (main branch), coremltools 7.2+.\n",
        "\n",
        "**Run cells top-to-bottom.** Section 5 (quantization) and Section 6 (accuracy check) depend on Section 2."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Dependencies installed. Restart the kernel if you see import errors, then run the next cell.\n"
          ]
        }
      ],
      "source": [
        "# Run this cell first to install dependencies into the current kernel.\n",
        "import subprocess\n",
        "import sys\n",
        "\n",
        "if sys.version_info < (3, 10):\n",
        "    print(\"WARNING: Python 3.10+ is required for transformers main (GLM-OCR). Current:\", sys.version)\n",
        "    print(\"Create a venv with Python 3.10+ and select it as the kernel to avoid 'Unrecognized processing class'.\")\n",
        "\n",
        "\n",
        "def pip_install(*packages, quiet=True):\n",
        "    \"\"\"Install `packages` with pip into the interpreter that runs this kernel.\n",
        "\n",
        "    Returns True when pip exits with status 0, False otherwise.\n",
        "    \"\"\"\n",
        "    cmd = [sys.executable, \"-m\", \"pip\", \"install\"] + ([\"-q\"] if quiet else []) + list(packages)\n",
        "    return subprocess.call(cmd) == 0\n",
        "\n",
        "\n",
        "# Requirements whose installation failed; reported at the end instead of being silently ignored.\n",
        "_failed_installs = []\n",
        "\n",
        "\n",
        "def _install_required(*packages):\n",
        "    \"\"\"Run pip_install and record the requirement in _failed_installs when pip fails.\"\"\"\n",
        "    ok = pip_install(*packages)\n",
        "    if not ok:\n",
        "        _failed_installs.append(\" \".join(packages))\n",
        "    return ok\n",
        "\n",
        "\n",
        "_install_required(\"numpy\", \"pillow\")\n",
        "# transformers main disables its PyTorch backend when torch < 2.4, which makes every model\n",
        "# import fail with a misleading ImportError. torchvision 0.19.0 is the release paired with torch 2.4.0.\n",
        "_install_required(\"torch==2.4.0\", \"torchvision==0.19.0\")\n",
        "\n",
        "# GLM-OCR needs transformers from main. If git install fails, try PyPI (newer PyPI may include GLM-OCR).\n",
        "if not pip_install(\"git+https://github.com/huggingface/transformers.git@main\"):\n",
        "    print(\"Git install failed (check network or build deps). Trying PyPI transformers...\")\n",
        "    _install_required(\"transformers>=4.45.0\")\n",
        "\n",
        "# coremltools 8.0 is tested against torch 2.4.0; 7.2 was only tested up to torch 2.2.0.\n",
        "_install_required(\"coremltools==8.0\")\n",
        "_install_required(\"huggingface_hub>=0.23.0\")\n",
        "\n",
        "if _failed_installs:\n",
        "    print(\"WARNING: pip could not install:\", \", \".join(_failed_installs))\n",
        "    print(\"Fix the errors above and re-run this cell before continuing.\")\n",
        "else:\n",
        "    print(\"Dependencies installed. Restart the kernel if you see import errors, then run the next cell.\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Torch version 2.3.0 has not been tested with coremltools. You may run into unexpected errors. Torch 2.2.0 is the most recent version that has been tested.\n",
            "/Users/am/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n",
            "Disabling PyTorch because PyTorch >= 2.4 is required but found 2.3.0\n",
            "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
          ]
        }
      ],
      "source": [
        "import json\n",
        "import os\n",
        "from pathlib import Path\n",
        "\n",
        "import numpy as np\n",
        "import torch\n",
        "import coremltools as ct\n",
        "from numpy.linalg import norm\n",
        "from PIL import Image\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Load model and processor\n",
        "\n",
        "Using `aoiandroid/GLM-OCR` (mirror of `zai-org/GLM-OCR`).  \n",
        "GLM-OCR support requires transformers installed from the main branch (the setup cell above already attempts this); to install it manually:  \n",
        "`pip install git+https://github.com/huggingface/transformers.git`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "PyTorch was not found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
          ]
        },
        {
          "ename": "ImportError",
          "evalue": "\nmodeling_auto requires the PyTorch library but it was not found in your environment. Check out the instructions on the\ninstallation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.\nPlease note that you may need to restart your runtime after installation.\n",
          "output_type": "error",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
            "Cell \u001b[0;32mIn[3], line 7\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 7\u001b[0m     \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtransformers\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m GlmOcrProcessor, GlmOcrForConditionalGeneration\n\u001b[1;32m      8\u001b[0m     processor \u001b[38;5;241m=\u001b[39m GlmOcrProcessor\u001b[38;5;241m.\u001b[39mfrom_pretrained(MODEL_ID)\n",
            "\u001b[0;31mImportError\u001b[0m: cannot import name 'GlmOcrProcessor' from 'transformers' (/Users/am/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/__init__.py)",
            "\nDuring handling of the above exception, another exception occurred:\n",
            "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
            "Cell \u001b[0;32mIn[3], line 11\u001b[0m\n\u001b[1;32m      9\u001b[0m     model \u001b[38;5;241m=\u001b[39m GlmOcrForConditionalGeneration\u001b[38;5;241m.\u001b[39mfrom_pretrained(MODEL_ID, torch_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32)\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m:\n\u001b[0;32m---> 11\u001b[0m     \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mtransformers\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoProcessor, AutoModelForImageTextToText\n\u001b[1;32m     12\u001b[0m     processor \u001b[38;5;241m=\u001b[39m AutoProcessor\u001b[38;5;241m.\u001b[39mfrom_pretrained(MODEL_ID, trust_remote_code\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m     13\u001b[0m     model \u001b[38;5;241m=\u001b[39m AutoModelForImageTextToText\u001b[38;5;241m.\u001b[39mfrom_pretrained(MODEL_ID, torch_dtype\u001b[38;5;241m=\u001b[39mtorch\u001b[38;5;241m.\u001b[39mfloat32, trust_remote_code\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2127\u001b[0m, in \u001b[0;36m_LazyModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2125\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_class_to_module:\n\u001b[1;32m   2126\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2127\u001b[0m         module \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_class_to_module\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2128\u001b[0m         value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(module, name)\n\u001b[1;32m   2129\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m, \u001b[38;5;167;01mRuntimeError\u001b[39;00m, \u001b[38;5;167;01mAttributeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   2130\u001b[0m         \u001b[38;5;66;03m# V5: If trying to import a *TokenizerFast symbol, transparently fall back to the\u001b[39;00m\n\u001b[1;32m   2131\u001b[0m         \u001b[38;5;66;03m# non-Fast symbol from the same module when available. This lets us keep only one\u001b[39;00m\n\u001b[1;32m   2132\u001b[0m         \u001b[38;5;66;03m# backend tokenizer class while preserving legacy public names.\u001b[39;00m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2321\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[0;34m(self, module_name)\u001b[0m\n\u001b[1;32m   2319\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m module_name, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m   2320\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m-> 2321\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2319\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[0;34m(self, module_name)\u001b[0m\n\u001b[1;32m   2317\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_module\u001b[39m(\u001b[38;5;28mself\u001b[39m, module_name: \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m   2318\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2319\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimportlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmodule_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2320\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   2321\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m e\n",
            "File \u001b[0;32m/opt/homebrew/Cellar/python@3.10/3.10.20/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m    124\u001b[0m             \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m    125\u001b[0m         level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/models/auto/processing_auto.py:40\u001b[0m\n\u001b[1;32m     38\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mfeature_extraction_auto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoFeatureExtractor\n\u001b[1;32m     39\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mimage_processing_auto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoImageProcessor\n\u001b[0;32m---> 40\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtokenization_auto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoTokenizer\n\u001b[1;32m     41\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mvideo_processing_auto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoVideoProcessor\n\u001b[1;32m     44\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m)\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:36\u001b[0m\n\u001b[1;32m     28\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     29\u001b[0m     extract_commit_hash,\n\u001b[1;32m     30\u001b[0m     is_g2p_en_available,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     33\u001b[0m     logging,\n\u001b[1;32m     34\u001b[0m )\n\u001b[1;32m     35\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mhub\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m cached_file\n\u001b[0;32m---> 36\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mencoder_decoder\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m EncoderDecoderConfig\n\u001b[1;32m     37\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauto_factory\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m _LazyAutoMapping\n\u001b[1;32m     38\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mconfiguration_auto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     39\u001b[0m     CONFIG_MAPPING_NAMES,\n\u001b[1;32m     40\u001b[0m     AutoConfig,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     43\u001b[0m     
replace_list_option_in_docstrings,\n\u001b[1;32m     44\u001b[0m )\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2127\u001b[0m, in \u001b[0;36m_LazyModule.__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2125\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_class_to_module:\n\u001b[1;32m   2126\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2127\u001b[0m         module \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_class_to_module\u001b[49m\u001b[43m[\u001b[49m\u001b[43mname\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2128\u001b[0m         value \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(module, name)\n\u001b[1;32m   2129\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mModuleNotFoundError\u001b[39;00m, \u001b[38;5;167;01mRuntimeError\u001b[39;00m, \u001b[38;5;167;01mAttributeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   2130\u001b[0m         \u001b[38;5;66;03m# V5: If trying to import a *TokenizerFast symbol, transparently fall back to the\u001b[39;00m\n\u001b[1;32m   2131\u001b[0m         \u001b[38;5;66;03m# non-Fast symbol from the same module when available. This lets us keep only one\u001b[39;00m\n\u001b[1;32m   2132\u001b[0m         \u001b[38;5;66;03m# backend tokenizer class while preserving legacy public names.\u001b[39;00m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2321\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[0;34m(self, module_name)\u001b[0m\n\u001b[1;32m   2319\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mimport_module(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m module_name, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m   2320\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m-> 2321\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:2319\u001b[0m, in \u001b[0;36m_LazyModule._get_module\u001b[0;34m(self, module_name)\u001b[0m\n\u001b[1;32m   2317\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_module\u001b[39m(\u001b[38;5;28mself\u001b[39m, module_name: \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m   2318\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 2319\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimportlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mimport_module\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m.\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmodule_name\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__name__\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2320\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   2321\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m e\n",
            "File \u001b[0;32m/opt/homebrew/Cellar/python@3.10/3.10.20/Frameworks/Python.framework/Versions/3.10/lib/python3.10/importlib/__init__.py:126\u001b[0m, in \u001b[0;36mimport_module\u001b[0;34m(name, package)\u001b[0m\n\u001b[1;32m    124\u001b[0m             \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m    125\u001b[0m         level \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m--> 126\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_bootstrap\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gcd_import\u001b[49m\u001b[43m(\u001b[49m\u001b[43mname\u001b[49m\u001b[43m[\u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpackage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlevel\u001b[49m\u001b[43m)\u001b[49m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/models/encoder_decoder/configuration_encoder_decoder.py:26\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mauto\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m AutoConfig\n\u001b[1;32m     22\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mget_logger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[1;32m     25\u001b[0m \u001b[38;5;129;43m@auto_docstring\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mcheckpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m---> 26\u001b[0m \u001b[38;5;28;43;01mclass\u001b[39;49;00m\u001b[38;5;250;43m \u001b[39;49m\u001b[38;5;21;43;01mEncoderDecoderConfig\u001b[39;49;00m\u001b[43m(\u001b[49m\u001b[43mPreTrainedConfig\u001b[49m\u001b[43m)\u001b[49m\u001b[43m:\u001b[49m\n\u001b[1;32m     27\u001b[0m \u001b[38;5;250;43m    \u001b[39;49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43;03m\"\"\"\u001b[39;49;00m\n\u001b[1;32m     28\u001b[0m \u001b[38;5;124;43;03m    Examples:\u001b[39;49;00m\n\u001b[1;32m     29\u001b[0m \n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     54\u001b[0m \u001b[38;5;124;43;03m    >>> model = EncoderDecoderModel.from_pretrained(\"my-model\", config=encoder_decoder_config)\u001b[39;49;00m\n\u001b[1;32m     55\u001b[0m \u001b[38;5;124;43;03m    ```\"\"\"\u001b[39;49;00m\n\u001b[1;32m     57\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmodel_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoder-decoder\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/auto_docstring.py:4387\u001b[0m, in \u001b[0;36mauto_docstring.<locals>.auto_docstring_decorator\u001b[0;34m(obj)\u001b[0m\n\u001b[1;32m   4383\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m auto_method_docstring(\n\u001b[1;32m   4384\u001b[0m         obj, custom_args\u001b[38;5;241m=\u001b[39mcustom_args, custom_intro\u001b[38;5;241m=\u001b[39mcustom_intro, checkpoint\u001b[38;5;241m=\u001b[39mcheckpoint\n\u001b[1;32m   4385\u001b[0m     )\n\u001b[1;32m   4386\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 4387\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mauto_class_docstring\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcustom_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcustom_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcustom_intro\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcustom_intro\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcheckpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint\u001b[49m\u001b[43m)\u001b[49m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/auto_docstring.py:4135\u001b[0m, in \u001b[0;36mauto_class_docstring\u001b[0;34m(cls, custom_intro, custom_args, checkpoint)\u001b[0m\n\u001b[1;32m   4133\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_name_lowercase \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   4134\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 4135\u001b[0m         model_base_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m   4136\u001b[0m \u001b[43m            \u001b[49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mauto_module\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mPLACEHOLDER_TO_AUTO_MODULE\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_class\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4137\u001b[0m \u001b[43m            \u001b[49m\u001b[43mPLACEHOLDER_TO_AUTO_MODULE\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmodel_class\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4138\u001b[0m \u001b[43m        \u001b[49m\u001b[43m)\u001b[49m[model_name_lowercase]\n\u001b[1;32m   4139\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m:\n\u001b[1;32m   4140\u001b[0m         \u001b[38;5;28;01mpass\u001b[39;00m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:1975\u001b[0m, in \u001b[0;36mDummyObject.__getattribute__\u001b[0;34m(cls, key)\u001b[0m\n\u001b[1;32m   1973\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (key\u001b[38;5;241m.\u001b[39mstartswith(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m key \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_from_config\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m key \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mis_dummy\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m key \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmro\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mor\u001b[39;00m key \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcall\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m   1974\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__getattribute__\u001b[39m(key)\n\u001b[0;32m-> 1975\u001b[0m \u001b[43mrequires_backends\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_backends\u001b[49m\u001b[43m)\u001b[49m\n",
            "File \u001b[0;32m~/Desktop/TranslateBlue/Notebooks/.venv_glm_ocr/lib/python3.10/site-packages/transformers/utils/import_utils.py:1961\u001b[0m, in \u001b[0;36mrequires_backends\u001b[0;34m(obj, backends)\u001b[0m\n\u001b[1;32m   1958\u001b[0m         failed\u001b[38;5;241m.\u001b[39mappend(msg\u001b[38;5;241m.\u001b[39mformat(name))\n\u001b[1;32m   1960\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m failed:\n\u001b[0;32m-> 1961\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(failed))\n",
            "\u001b[0;31mImportError\u001b[0m: \nmodeling_auto requires the PyTorch library but it was not found in your environment. Check out the instructions on the\ninstallation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.\nPlease note that you may need to restart your runtime after installation.\n"
          ]
        }
      ],
      "source": [
        "MODEL_ID = \"aoiandroid/GLM-OCR\"  # or \"zai-org/GLM-OCR\"\n",
        "OUTPUT_DIR = Path(\"./glm_ocr_coreml\")\n",
        "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n",
        "\n",
        "# Fail fast with an actionable message: when transformers rejects the installed torch\n",
        "# (for example because it is too old), the model imports below otherwise die with a long,\n",
        "# chained ImportError that hides the real cause.\n",
        "from transformers.utils import is_torch_available\n",
        "\n",
        "if not is_torch_available():\n",
        "    raise RuntimeError(\n",
        "        f\"transformers cannot use the installed PyTorch ({torch.__version__}). \"\n",
        "        \"Install a PyTorch version supported by this transformers build, restart the kernel, and re-run.\"\n",
        "    )\n",
        "\n",
        "# Prefer transformers main (GlmOcrProcessor). Else try loading from Hub with trust_remote_code.\n",
        "# Only the import is guarded, so a genuine loading error is not masked by the fallback path.\n",
        "try:\n",
        "    from transformers import GlmOcrProcessor, GlmOcrForConditionalGeneration\n",
        "except ImportError:\n",
        "    # NOTE: trust_remote_code=True executes Python code shipped in the model repo; use trusted repos only.\n",
        "    from transformers import AutoProcessor, AutoModelForImageTextToText\n",
        "    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)\n",
        "    model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, torch_dtype=torch.float32, trust_remote_code=True)\n",
        "else:\n",
        "    processor = GlmOcrProcessor.from_pretrained(MODEL_ID)\n",
        "    model = GlmOcrForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=torch.float32)\n",
        "model.eval()\n",
        "\n",
        "# Resolve image_size and hidden_size from config, falling back to defaults when the\n",
        "# attribute is missing or unset (None).\n",
        "DEFAULT_IMAGE_SIZE = 336\n",
        "DEFAULT_HIDDEN_SIZE = 1024\n",
        "\n",
        "vision_config = getattr(model.config, \"vision_config\", None)\n",
        "image_size = getattr(vision_config, \"image_size\", None) or DEFAULT_IMAGE_SIZE\n",
        "if isinstance(image_size, (list, tuple)):\n",
        "    # A (height, width) style entry: the export below uses a square input, so take the first value.\n",
        "    image_size = image_size[0]\n",
        "\n",
        "text_config = getattr(model.config, \"text_config\", None)\n",
        "hidden_size = (\n",
        "    getattr(model.config, \"hidden_size\", None)\n",
        "    or getattr(text_config, \"hidden_size\", None)\n",
        "    or DEFAULT_HIDDEN_SIZE\n",
        ")\n",
        "print(f\"Image size: {image_size}, hidden_size: {hidden_size}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### 1.1 Model structure validation\n",
        "\n",
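        "Verify the attributes the export relies on (`model.model` and `get_image_features`) before tracing. Raises immediately if either is missing, which usually means the installed transformers version is incompatible."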
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "print(\"=== Model structure ===\")\n",
        "print(f\"Model class: {type(model).__name__}\")\n",
        "\n",
        "# Section 2 traces model.model.get_image_features, so both pieces must be present.\n",
        "inner = getattr(model, \"model\", None)\n",
        "if inner is None:\n",
        "    raise RuntimeError(\n",
        "        \"model.model not found. Inspect the loaded model structure with: print(model)\"\n",
        "    )\n",
        "if not hasattr(inner, \"get_image_features\"):\n",
        "    _install_hint = (\n",
        "        \"get_image_features not found. Install transformers from main:\\n\"\n",
        "        \"  pip install git+https://github.com/huggingface/transformers.git\"\n",
        "    )\n",
        "    raise RuntimeError(_install_hint)\n",
        "\n",
        "# Echo the relevant config entries ('N/A' when the config does not define them).\n",
        "for _label, _key in ((\"vision_config\", \"vision_config\"), (\"hidden_size  \", \"hidden_size\")):\n",
        "    print(f\"{_label}: {getattr(model.config, _key, 'N/A')}\")\n",
        "\n",
        "# Locate decoder submodule (for Section 4 decoder export): first candidate attribute,\n",
        "# looked up on the outer model and then on model.model, that has a forward method.\n",
        "_decoder_attr = None\n",
        "for _name in (\"language_model\", \"text_model\", \"decoder\"):\n",
        "    _obj = getattr(model, _name, None) or getattr(inner, _name, None)\n",
        "    if _obj is None or not hasattr(_obj, \"forward\"):\n",
        "        continue\n",
        "    _decoder_attr = _name\n",
        "    print(f\"Decoder submodule found: '{_name}'\")\n",
        "    break\n",
        "else:\n",
        "    # Loop finished without a match.\n",
        "    print(\"No separate decoder submodule; will use inputs_embeds path in Section 4.\")\n",
        "\n",
        "print(\"Structure validation OK\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Export vision encoder to CoreML\n",
        "\n",
        "Trace `model.model.get_image_features(pixel_values)` to extract the CogViT encoder.  \n",
        "Output shape: `(1, vision_seq_len, hidden_size)`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "class VisionEncoderWrapper(torch.nn.Module):\n",
        "    \"\"\"Traceable adapter around ``get_image_features``.\n",
        "\n",
        "    Maps pixel_values (1,3,H,W) -> last_hidden_state (1, vision_seq_len, hidden_size).\n",
        "    The wrapped module is ``parent_model.model`` when that attribute exists,\n",
        "    otherwise ``parent_model`` itself.\n",
        "\n",
        "    Raises:\n",
        "        AttributeError: if the wrapped module has no ``get_image_features``.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, parent_model):\n",
        "        super().__init__()\n",
        "        backbone = getattr(parent_model, \"model\", parent_model)\n",
        "        if not hasattr(backbone, \"get_image_features\"):\n",
        "            raise AttributeError(\n",
        "                \"get_image_features not found; ensure transformers supports GLM-OCR.\"\n",
        "            )\n",
        "        self.base = backbone\n",
        "\n",
        "    def forward(self, pixel_values: torch.Tensor):\n",
        "        \"\"\"Return ``last_hidden_state`` of ``get_image_features(pixel_values=...)``.\"\"\"\n",
        "        features = self.base.get_image_features(pixel_values=pixel_values)\n",
        "        return features.last_hidden_state\n",
        "\n",
        "# Build the wrapper in inference mode (Module.eval() returns the module itself).\n",
        "wrapper = VisionEncoderWrapper(model).eval()\n",
        "\n",
        "# Example input used only to record the trace.\n",
        "batch = 1\n",
        "channels = 3\n",
        "dummy_pixel = torch.randn(\n",
        "    (batch, channels, image_size, image_size), dtype=torch.float32\n",
        ")\n",
        "\n",
        "with torch.no_grad():\n",
        "    traced = torch.jit.trace(\n",
        "        wrapper,\n",
        "        (dummy_pixel,),\n",
        "        check_trace=False,\n",
        "        strict=False,\n",
        "    )\n",
        "    enc_out = traced(dummy_pixel)\n",
        "\n",
        "# Take the sequence length and hidden width from the traced output itself,\n",
        "# replacing any value of hidden_size set earlier in the notebook.\n",
        "vision_seq_len, hidden_size = enc_out.shape[1:3]\n",
        "print(f\"Vision encoder output: {enc_out.shape}  (seq_len={vision_seq_len}, hidden={hidden_size})\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Fixed-shape float32 image input; the output is renamed for the Swift side.\n",
        "input_types  = [ct.TensorType(name=\"pixel_values\",\n",
        "                               shape=(1, channels, image_size, image_size),\n",
        "                               dtype=np.float32)]\n",
        "output_types = [ct.TensorType(name=\"vision_hidden_states\")]\n",
        "\n",
        "# iOS16 is recommended minimum for mlprogram.\n",
        "# Lower to iOS15 only after on-device testing confirms compatibility.\n",
        "#\n",
        "# compute_precision is set explicitly: for mlprogram, ct.convert defaults to\n",
        "# float16, but this notebook treats this export as the FP32 baseline\n",
        "# (size comparison in Section 5, accuracy assertion in Section 6).\n",
        "vision_mlmodel = ct.convert(\n",
        "    traced,\n",
        "    inputs=input_types,\n",
        "    outputs=output_types,\n",
        "    convert_to=\"mlprogram\",\n",
        "    minimum_deployment_target=ct.target.iOS16,\n",
        "    compute_units=ct.ComputeUnit.ALL,\n",
        "    compute_precision=ct.precision.FLOAT32,\n",
        ")\n",
        "\n",
        "vision_path = OUTPUT_DIR / \"vision_encoder.mlpackage\"\n",
        "vision_mlmodel.save(str(vision_path))\n",
        "print(f\"Saved: {vision_path}\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Shapes are converted to plain ints so json can serialise them.\n",
        "_pixel_shape  = [1, 3, int(image_size), int(image_size)]\n",
        "_hidden_shape = [1, int(vision_seq_len), int(hidden_size)]\n",
        "\n",
        "model_spec: dict = {\n",
        "    \"vision_encoder\": {\n",
        "        \"input\":  {\"name\": \"pixel_values\", \"shape\": _pixel_shape, \"dtype\": \"float32\"},\n",
        "        \"output\": {\"name\": \"vision_hidden_states\", \"shape\": _hidden_shape, \"dtype\": \"float32\"},\n",
        "    },\n",
        "    \"image_size\":     int(image_size),\n",
        "    \"vision_seq_len\": int(vision_seq_len),\n",
        "    \"hidden_size\":    int(hidden_size),\n",
        "    \"model_id\":       MODEL_ID,\n",
        "}\n",
        "\n",
        "# Persist the spec next to the exported packages.\n",
        "spec_path = OUTPUT_DIR / \"model_spec.json\"\n",
        "spec_path.write_text(json.dumps(model_spec, indent=2))\n",
        "print(f\"Spec saved: {spec_path}\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Save processor and config\n",
        "\n",
        "Save tokenizer and image processor so the iOS app can preprocess images and decode output tokens."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Write the processor files first, then the model config, into OUTPUT_DIR.\n",
        "for _saveable in (processor, model.config):\n",
        "    _saveable.save_pretrained(OUTPUT_DIR)\n",
        "print(f\"Saved processor and config to {OUTPUT_DIR}\")\n",
        "\n",
        "# List what is now in the output directory (names only, sorted).\n",
        "_saved_names = sorted(entry.name for entry in OUTPUT_DIR.iterdir())\n",
        "print(\"Contents:\", _saved_names)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. Decoder export (single-step)\n",
        "\n",
        "Export one forward step of the GLM-0.5B decoder:  \n",
        "`(input_ids, encoder_hidden_states, attention_mask) -> logits`\n",
        "\n",
        "The iOS app calls this model in an autoregressive loop to generate text.\n",
        "\n",
        "**Sequence layout**: positions `[0..vision_seq_len-1]` = image tokens (from `encoder_hidden_states`),  \n",
        "positions `[vision_seq_len..end]` = text tokens (embedded from `input_ids`).\n",
        "\n",
        "**Variable-length input**: `ct.RangeDim` allows `input_ids` and `attention_mask` to accept any length  \n",
        "from `vision_seq_len+1` to `DECODER_MAX_LEN`. Pad to `DECODER_MAX_LEN` in Swift and mask padding with `0`.\n",
        "\n",
        "If this section fails, GLM-OCR may not separate vision and text forwards cleanly.  \n",
        "Use the vision encoder only and implement generation in Swift or Python."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Defaults for the decoder export; they are defined up front so the later cells\n",
        "# that read them never raise NameError, even if tracing or conversion fails.\n",
        "decoder_exported = False\n",
        "dec_traced = None\n",
        "decoder_path = None\n",
        "# Sequence budget: the vision tokens plus 128 more positions, and never below 256.\n",
        "DECODER_MAX_LEN = max(int(vision_seq_len) + 128, 256)\n",
        "\n",
        "class DecoderStepWrapper(torch.nn.Module):\n",
        "    \"\"\"Single decoder pass: (input_ids, encoder_hidden_states, attention_mask) -> logits.\n",
        "\n",
        "    Sequence layout:\n",
        "      positions [0 .. vision_seq_len-1] : image tokens from encoder_hidden_states\n",
        "      positions [vision_seq_len .. end] : text tokens embedded from input_ids\n",
        "\n",
        "    The entries of ``input_ids`` that fall on image positions are never embedded;\n",
        "    only the slice after the vision tokens is looked up.\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, parent_model):\n",
        "        super().__init__()\n",
        "        # Keep references to the three pieces used in forward().\n",
        "        self.embed   = parent_model.get_input_embeddings()\n",
        "        self.inner   = parent_model.model\n",
        "        self.lm_head = parent_model.lm_head\n",
        "\n",
        "    def forward(\n",
        "        self,\n",
        "        input_ids:             torch.Tensor,  # (1, seq_len) int64\n",
        "        encoder_hidden_states: torch.Tensor,  # (1, vision_seq_len, hidden_size)\n",
        "        attention_mask:        torch.Tensor,  # (1, seq_len) int64\n",
        "    ) -> torch.Tensor:\n",
        "        n_vision = encoder_hidden_states.shape[1]  # = vision_seq_len\n",
        "\n",
        "        if input_ids.shape[1] - n_vision <= 0:\n",
        "            # No text tokens yet: the sequence consists of the image tokens only.\n",
        "            inputs_embeds = encoder_hidden_states\n",
        "        else:\n",
        "            # Usual case: image tokens followed by the embedded text tokens.\n",
        "            text_emb      = self.embed(input_ids[:, n_vision:])\n",
        "            inputs_embeds = torch.cat([encoder_hidden_states, text_emb], dim=1)\n",
        "\n",
        "        hidden = self.inner(\n",
        "            attention_mask=attention_mask,\n",
        "            inputs_embeds=inputs_embeds,\n",
        "            use_cache=False,\n",
        "        ).last_hidden_state\n",
        "        return self.lm_head(hidden)\n",
        "\n",
        "# Best-effort trace: a failure here must not stop the vision-only workflow.\n",
        "try:\n",
        "    dec_wrapper = DecoderStepWrapper(model)\n",
        "    dec_wrapper.eval()\n",
        "\n",
        "    # Example inputs at the maximum sequence length (values are arbitrary).\n",
        "    dummy_ids  = torch.randint(0, 1000, (1, DECODER_MAX_LEN), dtype=torch.long)\n",
        "    dummy_enc  = torch.randn(1, vision_seq_len, hidden_size, dtype=torch.float32)\n",
        "    dummy_attn = torch.ones(1, DECODER_MAX_LEN, dtype=torch.long)\n",
        "\n",
        "    with torch.no_grad():\n",
        "        dec_traced = torch.jit.trace(\n",
        "            dec_wrapper,\n",
        "            (dummy_ids, dummy_enc, dummy_attn),\n",
        "            check_trace=False,\n",
        "            strict=False,\n",
        "        )\n",
        "        # Sanity run of the traced module on the same inputs.\n",
        "        _dec_out = dec_traced(dummy_ids, dummy_enc, dummy_attn)\n",
        "        print(f\"Decoder trace OK. Output shape: {_dec_out.shape}\")\n",
        "\n",
        "except Exception as e:\n",
        "    # Reset the handle: if the trace succeeded but the sanity run failed,\n",
        "    # dec_traced would otherwise stay set and the next cell would still convert it.\n",
        "    dec_traced = None\n",
        "    print(f\"Decoder trace failed: {e}\")\n",
        "    print(\"Continuing with vision encoder only.\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "if dec_traced is not None:\n",
        "    # RangeDim lets the model accept any seq_len from (vision_seq_len+1) to DECODER_MAX_LEN.\n",
        "    # In Swift, always pad input_ids/attention_mask to the same length and mask padding with 0.\n",
        "    _seq_lower = int(vision_seq_len) + 1\n",
        "    seq_range = ct.RangeDim(\n",
        "        lower_bound=_seq_lower,\n",
        "        upper_bound=DECODER_MAX_LEN,\n",
        "    )\n",
        "\n",
        "    dec_input_types = [\n",
        "        ct.TensorType(name=\"input_ids\",\n",
        "                      shape=(1, seq_range), dtype=np.int32),\n",
        "        ct.TensorType(name=\"encoder_hidden_states\",\n",
        "                      shape=(1, int(vision_seq_len), int(hidden_size)), dtype=np.float32),\n",
        "        ct.TensorType(name=\"attention_mask\",\n",
        "                      shape=(1, seq_range), dtype=np.int32),\n",
        "    ]\n",
        "    dec_output_types = [ct.TensorType(name=\"logits\")]\n",
        "\n",
        "    decoder_mlmodel = ct.convert(\n",
        "        dec_traced,\n",
        "        inputs=dec_input_types,\n",
        "        outputs=dec_output_types,\n",
        "        convert_to=\"mlprogram\",\n",
        "        minimum_deployment_target=ct.target.iOS16,\n",
        "        compute_units=ct.ComputeUnit.ALL,\n",
        "    )\n",
        "\n",
        "    decoder_path = OUTPUT_DIR / \"decoder.mlpackage\"\n",
        "    decoder_mlmodel.save(str(decoder_path))\n",
        "    decoder_exported = True\n",
        "    print(f\"Saved: {decoder_path}\")\n",
        "\n",
        "    # vocab_size: top-level config first, then text_config, then a fixed fallback.\n",
        "    vocab_size = int(\n",
        "        getattr(model.config, \"vocab_size\", None)\n",
        "        or getattr(getattr(model.config, \"text_config\", None), \"vocab_size\", 59392)\n",
        "        or 59392\n",
        "    )\n",
        "    # Record the real sequence range of the converted model. The lower bound is\n",
        "    # vision_seq_len+1 (see seq_range above), not 1, so write the actual numbers.\n",
        "    _seq_spec = f\"{_seq_lower}..{DECODER_MAX_LEN}\"\n",
        "    model_spec[\"decoder\"] = {\n",
        "        \"input\": {\n",
        "            \"input_ids\":             {\"shape\": [1, _seq_spec], \"dtype\": \"int32\"},\n",
        "            \"encoder_hidden_states\": {\"shape\": [1, int(vision_seq_len), int(hidden_size)], \"dtype\": \"float32\"},\n",
        "            \"attention_mask\":        {\"shape\": [1, _seq_spec], \"dtype\": \"int32\"},\n",
        "        },\n",
        "        \"output\": {\"name\": \"logits\", \"shape\": [1, \"seq_len\", vocab_size]},\n",
        "        \"decoder_max_len\": DECODER_MAX_LEN,\n",
        "        \"note\": \"Pad input_ids and attention_mask to the same length; mask padding with 0.\",\n",
        "    }\n",
        "    with open(spec_path, \"w\") as f:\n",
        "        json.dump(model_spec, f, indent=2)\n",
        "    print(\"model_spec.json updated with decoder I/O.\")\n",
        "else:\n",
        "    print(\"Decoder not exported; model_spec.json unchanged.\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. Quantization (FP16 / INT8) and size comparison\n",
        "\n",
        "Reduce `vision_encoder.mlpackage` size for App Store distribution.  \n",
        "Run **Section 6 (accuracy verification)** after INT8 quantization.\n",
        "\n",
        "Weight quantization uses `coremltools.optimize.coreml` (the correct API for the `mlprogram` format).  \n",
        "The legacy `neural_network.quantization_utils` API does **not** work with mlprogram.\n",
        "\n",
        "| Method | Expected size reduction |\n",
        "|--------|------------------------|\n",
        "| FP16   | ~50%                   |\n",
        "| INT8   | ~75%                   |"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from coremltools.optimize.coreml import (\n",
        "    linear_quantize_weights,\n",
        "    OptimizationConfig,\n",
        "    OpLinearQuantizerConfig,\n",
        ")\n",
        "\n",
        "def _dir_size_mb(p: Path) -> float:\n",
        "    \"\"\"Return the combined size of all regular files under ``p`` (recursive), in units of 1e6 bytes.\"\"\"\n",
        "    total_bytes = 0\n",
        "    for entry in p.rglob(\"*\"):\n",
        "        if entry.is_file():\n",
        "            total_bytes += entry.stat().st_size\n",
        "    return total_bytes / 1e6\n",
        "\n",
        "# FP16 -- minimal accuracy loss, ~50% smaller\n",
        "# OpLinearQuantizerConfig describes integer weight quantization, so np.float16 is\n",
        "# not a usable dtype there. For mlprogram models half precision is selected at\n",
        "# conversion time, so the traced encoder from Section 2 is converted again with\n",
        "# float16 compute precision (reuses traced / input_types / output_types).\n",
        "vision_fp16_path = None\n",
        "try:\n",
        "    vision_fp16 = ct.convert(\n",
        "        traced,\n",
        "        inputs=input_types,\n",
        "        outputs=output_types,\n",
        "        convert_to=\"mlprogram\",\n",
        "        minimum_deployment_target=ct.target.iOS16,\n",
        "        compute_units=ct.ComputeUnit.ALL,\n",
        "        compute_precision=ct.precision.FLOAT16,\n",
        "    )\n",
        "    vision_fp16_path = OUTPUT_DIR / \"vision_encoder_fp16.mlpackage\"\n",
        "    vision_fp16.save(str(vision_fp16_path))\n",
        "    print(f\"FP16 saved: {vision_fp16_path}\")\n",
        "except Exception as e:\n",
        "    # Best effort: report and carry on so the INT8 step and size table still run.\n",
        "    print(f\"FP16 quantization failed: {e}\")\n",
        "\n",
        "# INT8 -- more compact; verify accuracy in Section 6\n",
        "# dtype=np.int8 is spelled out so the quantized weight type is visible here.\n",
        "vision_int8_path = None\n",
        "try:\n",
        "    # The config is built inside the try so a rejected option is reported\n",
        "    # like any other quantization failure instead of aborting the cell.\n",
        "    config_int8 = OptimizationConfig(\n",
        "        global_config=OpLinearQuantizerConfig(\n",
        "            mode=\"linear_symmetric\",\n",
        "            dtype=np.int8,\n",
        "            weight_threshold=512,\n",
        "        )\n",
        "    )\n",
        "    vision_int8 = linear_quantize_weights(vision_mlmodel, config_int8)\n",
        "    vision_int8_path = OUTPUT_DIR / \"vision_encoder_int8.mlpackage\"\n",
        "    vision_int8.save(str(vision_int8_path))\n",
        "    print(f\"INT8 saved: {vision_int8_path}\")\n",
        "except Exception as e:\n",
        "    print(f\"INT8 quantization failed: {e}\")\n",
        "\n",
        "# Report on-disk size of every variant that was actually written.\n",
        "print(\"\\n=== Size comparison ===\")\n",
        "for label, path in [\n",
        "    (\"FP32 (original)\", vision_path),\n",
        "    (\"FP16\",            vision_fp16_path),\n",
        "    (\"INT8\",            vision_int8_path),\n",
        "]:\n",
        "    if path is not None and Path(path).exists():\n",
        "        print(f\"  {label:<20}: {_dir_size_mb(Path(path)):.1f} MB\")\n",
        "    else:\n",
        "        print(f\"  {label:<20}: (not available)\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. Accuracy verification (PyTorch vs CoreML)\n",
        "\n",
        "Compare per-token cosine similarity between the PyTorch traced model and the CoreML FP32 model.  \n",
        "Expected: mean cosine similarity > 0.999.  \n",
        "Place a real text image at `test_image.png` for a meaningful check; a random tensor is used otherwise.  \n",
        "The INT8-quantized model is also verified if available."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def cosine_similarity_stats(a: np.ndarray, b: np.ndarray):\n",
        "    \"\"\"Per-token cosine similarity between two (1, T, D) arrays.\n",
        "\n",
        "    Only batch item 0 is compared. A token whose norm product is zero is\n",
        "    counted as similarity 1.0.\n",
        "\n",
        "    Returns:\n",
        "        (mean, min) of the T per-token similarities, as Python floats.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: if the inputs are not 3-D or their shapes differ. Without\n",
        "            this check a longer ``b`` would be truncated silently.\n",
        "    \"\"\"\n",
        "    # float64 keeps the dot products accurate even for half-precision inputs.\n",
        "    a = np.asarray(a, dtype=np.float64)\n",
        "    b = np.asarray(b, dtype=np.float64)\n",
        "    if a.ndim != 3 or a.shape != b.shape:\n",
        "        raise ValueError(\n",
        "            f\"Expected two arrays of identical (1, T, D) shape; got {a.shape} and {b.shape}.\"\n",
        "        )\n",
        "\n",
        "    u, v = a[0], b[0]  # (T, D) each\n",
        "    dots  = np.sum(u * v, axis=1)\n",
        "    denom = np.linalg.norm(u, axis=1) * np.linalg.norm(v, axis=1)\n",
        "    # Start from 1.0 and overwrite only where the denominator is non-zero.\n",
        "    sims = np.ones_like(denom)\n",
        "    np.divide(dots, denom, out=sims, where=denom > 0)\n",
        "    return float(np.mean(sims)), float(np.min(sims))\n",
        "\n",
        "# Load test image\n",
        "test_image_path = Path(\"test_image.png\")\n",
        "if test_image_path.exists():\n",
        "    _img = Image.open(test_image_path).convert(\"RGB\")\n",
        "    _inputs = processor(images=_img, return_tensors=\"pt\")\n",
        "    pixel_values = _inputs[\"pixel_values\"].to(torch.float32)\n",
        "    # The CoreML input has a fixed spatial size; resize when the processor output differs.\n",
        "    if pixel_values.shape[2] != image_size or pixel_values.shape[3] != image_size:\n",
        "        pixel_values = torch.nn.functional.interpolate(\n",
        "            pixel_values, size=(image_size, image_size),\n",
        "            mode=\"bilinear\", align_corners=False,\n",
        "        )\n",
        "    print(f\"Loaded: {test_image_path}\")\n",
        "else:\n",
        "    pixel_values = torch.randn(1, 3, image_size, image_size, dtype=torch.float32)\n",
        "    print(\"test_image.png not found -- using random tensor (shape verification only).\")\n",
        "\n",
        "# PyTorch baseline\n",
        "with torch.no_grad():\n",
        "    pt_out = traced(pixel_values).numpy()  # (1, vision_seq_len, hidden_size)\n",
        "\n",
        "pv_np = pixel_values.numpy()\n",
        "\n",
        "# CoreML FP32\n",
        "coreml_out = vision_mlmodel.predict({\"pixel_values\": pv_np})[\"vision_hidden_states\"]\n",
        "m32, n32 = cosine_similarity_stats(pt_out, coreml_out)\n",
        "print(f\"PyTorch vs CoreML FP32  -- mean cosine: {m32:.6f}, min: {n32:.6f}\")\n",
        "assert m32 > 0.999, f\"FP32 accuracy too low ({m32:.6f}); check conversion settings.\"\n",
        "print(\"FP32 accuracy OK\")\n",
        "\n",
        "# CoreML INT8 (if available)\n",
        "# Section 5 is optional, so vision_int8_path may not exist in this kernel;\n",
        "# look it up defensively instead of raising NameError.\n",
        "_int8_path = globals().get(\"vision_int8_path\")\n",
        "if _int8_path and Path(str(_int8_path)).exists():\n",
        "    _int8_model = ct.models.MLModel(str(_int8_path))\n",
        "    int8_out = _int8_model.predict({\"pixel_values\": pv_np})[\"vision_hidden_states\"]\n",
        "    m8, n8 = cosine_similarity_stats(pt_out, int8_out)\n",
        "    print(f\"PyTorch vs CoreML INT8  -- mean cosine: {m8:.6f}, min: {n8:.6f}\")\n",
        "    if m8 < 0.99:\n",
        "        print(f\"WARNING: INT8 accuracy low ({m8:.4f}). Consider using FP16 instead.\")\n",
        "    else:\n",
        "        print(\"INT8 accuracy OK\")\n",
        "else:\n",
        "    print(\"INT8 model not available; skipping INT8 check.\")\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. Verify CoreML I/O\n",
        "\n",
        "Inspect input/output names and shapes for Xcode integration."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Reload the saved vision package from disk and list its feature names.\n",
        "loaded_v = ct.models.MLModel(str(vision_path))\n",
        "spec_v   = loaded_v.get_spec()\n",
        "for _label, _features in (\n",
        "    (\"inputs :\", spec_v.description.input),\n",
        "    (\"outputs:\", spec_v.description.output),\n",
        "):\n",
        "    print(f\"Vision encoder {_label}\", [feat.name for feat in _features])\n",
        "\n",
        "# Same listing for the decoder, only when Section 4 actually exported it.\n",
        "if decoder_exported:\n",
        "    loaded_d = ct.models.MLModel(str(decoder_path))\n",
        "    spec_d   = loaded_d.get_spec()\n",
        "    for _label, _features in (\n",
        "        (\"inputs :\", spec_d.description.input),\n",
        "        (\"outputs:\", spec_d.description.output),\n",
        "    ):\n",
        "        print(f\"Decoder {_label}\", [feat.name for feat in _features])\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. Swift integration sketch\n",
        "\n",
        "Add `vision_encoder.mlpackage` (and `decoder.mlpackage` if exported) to the Xcode project.  \n",
        "`model_spec.json` contains the exact I/O shapes needed for both models."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Printed as a reference for the app side; nothing in this cell is executed as Swift.\n",
        "# The decode loop seeds a start token and reads logits at the last FILLED position,\n",
        "# because a causal decoder's logits at position p predict the token at position p+1.\n",
        "swift_sketch = \"\"\"\n",
        "// Swift: GLM-OCR inference with CoreML\n",
        "//\n",
        "// Setup:\n",
        "//   1. Add vision_encoder.mlpackage to Xcode (auto-generates VisionEncoder class).\n",
        "//   2. If decoder.mlpackage was exported, add it (auto-generates DecoderStep class).\n",
        "//   3. Read model_spec.json at runtime to get image_size, vision_seq_len, hidden_size,\n",
        "//      and decoder.decoder_max_len (called DECODER_MAX_LEN below).\n",
        "//\n",
        "// --- Vision encoder ---\n",
        "// let visionModel  = try VisionEncoder(configuration: .init())\n",
        "// let pixelValues  = preprocessImage(uiImage, size: image_size)  // MLMultiArray (1,3,image_size,image_size) Float32\n",
        "// let visInput     = VisionEncoderInput(pixel_values: pixelValues)\n",
        "// let visOutput    = try visionModel.prediction(input: visInput)\n",
        "// let hiddenStates = visOutput.vision_hidden_states  // MLMultiArray (1, vision_seq_len, hidden_size)\n",
        "//\n",
        "// --- Autoregressive decoding loop (requires decoder.mlpackage) ---\n",
        "// Logits at position p predict the token at position p+1, so the text part must be\n",
        "// seeded with a start/prompt token (model-specific; take it from the tokenizer).\n",
        "// var tokenIds    = [Int32](repeating: 0, count: DECODER_MAX_LEN)\n",
        "// var attnMask    = [Int32](repeating: 0, count: DECODER_MAX_LEN)\n",
        "// for i in 0..<vision_seq_len { attnMask[i] = 1 }  // unmask image positions\n",
        "// tokenIds[vision_seq_len] = Int32(startTokenId)   // first text token\n",
        "// attnMask[vision_seq_len] = 1\n",
        "// var pos = vision_seq_len + 1                     // next position to fill\n",
        "// let decoderModel = try DecoderStep(configuration: .init())\n",
        "//\n",
        "// while pos < DECODER_MAX_LEN {\n",
        "//     let decInput  = DecoderStepInput(\n",
        "//         input_ids:             MLMultiArray(tokenIds),\n",
        "//         encoder_hidden_states: hiddenStates,\n",
        "//         attention_mask:        MLMultiArray(attnMask)\n",
        "//     )\n",
        "//     let logits    = try decoderModel.prediction(input: decInput).logits\n",
        "//     let nextToken = argmax(logits, at: pos - 1)  // last filled position predicts position pos\n",
        "//     if nextToken == eosTokenId { break }\n",
        "//     tokenIds[pos] = Int32(nextToken)\n",
        "//     attnMask[pos] = 1                  // unmask the token that was just written\n",
        "//     pos += 1\n",
        "// }\n",
        "// let outputText = tokenizer.decode(tokenIds[vision_seq_len..<pos])  // text positions only\n",
        "\"\"\"\n",
        "print(swift_sketch)\n"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": ".venv_glm_ocr",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.20"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}