Ji-Ha committed
Commit d8268c5 · 0 Parent(s)

Duplicate from Ji-Ha/glm-ocr-onnx


Co-authored-by: Ji-Ha <Ji-Ha@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.data filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,92 @@
+ ---
+ license: mit
+ language:
+ - zh
+ - en
+ - fr
+ - es
+ - ru
+ - de
+ - ja
+ - ko
+ pipeline_tag: image-to-text
+ library_name: onnxruntime
+ base_model:
+ - zai-org/GLM-OCR
+ ---
+
+ # GLM-OCR ONNX (Static Split, Edge-Oriented)
+
+ This repository contains a production-oriented ONNX export of GLM-OCR, bundled as statically wired split graphs with a quantization-aware layout for edge and browser deployment workflows.
+
+ ## Credits and Upstream
+
+ - Original model and research release: `zai-org/GLM-OCR`
+ - Hugging Face: https://huggingface.co/zai-org/GLM-OCR
+ - GitHub: https://github.com/zai-org/GLM-OCR
+ - This repo is a deployment/export artifact built from the upstream model for ONNX static inference pipelines.
+
+ Please cite and credit the original GLM-OCR authors for model architecture, training, and benchmark claims.
+
+ ## What Is Included
+
+ - `manifest.json`: runtime manifest for static Python/ONNX flows.
+ - `manifest.web.json`: ORT Web (WASM/WebGPU) wiring manifest.
+ - `fp16/`: core fp16 split graphs and external weight shards.
+ - `quant/`: quantized vision graph (`vision_quant`) and its external shard.
+
+ The bundle is organized so that quantized assets are clearly separated from fp16 assets.
+
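+ For orientation, here is a minimal Python sketch of how a runner might resolve graph paths from `manifest.json`; `resolve_graphs` and `prefer_quant_vision` are illustrative names for this README, not part of the repo's tooling:
+
+ ```python
+ import json
+ from pathlib import Path
+
+ def resolve_graphs(artifact_dir: str, prefer_quant_vision: bool = False) -> dict:
+     """Map graph names from manifest.json to on-disk ONNX paths."""
+     root = Path(artifact_dir)
+     manifest = json.loads((root / "manifest.json").read_text())
+     graphs = dict(manifest["graphs"])
+     # Optionally route "vision" through the quantized variant when present.
+     if prefer_quant_vision and "vision_quant" in graphs:
+         graphs["vision"] = graphs["vision_quant"]
+     return {name: root / rel for name, rel in graphs.items()}
+ ```
+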
+ ## Notes on Quality and Optimization
+
+ - The primary quality baseline is upstream GLM-OCR behavior; deployment optimizations are chosen to preserve it.
+ - Quantization is applied selectively (not blanket full-model int8) to avoid OCR quality degradation on difficult layouts; see the sketch after this list.
+ - `vision_quant` is provided as an optional path, while fp16 vision remains available.
+
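+ For illustration, selective weight quantization with `onnxruntime.quantization` looks roughly like the sketch below. This shows the general technique (typically run against an fp32 export), not the exact recipe behind `vision_quant`; the input path and excluded node name are placeholders:
+
+ ```python
+ from onnxruntime.quantization import QuantType, quantize_dynamic
+
+ quantize_dynamic(
+     model_input="vision_fp32.onnx",           # placeholder input path
+     model_output="quant/glm_ocr_vision_quant.onnx",
+     op_types_to_quantize=["MatMul"],          # quantize MatMul weights only
+     nodes_to_exclude=["/merger/mlp/MatMul"],  # placeholder: keep sensitive nodes in float
+     weight_type=QuantType.QInt8,
+     use_external_data_format=True,            # emit the external .data shard
+ )
+ ```
+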
+ ## Python Inference Example
+
+ Use your static runner with `manifest.json` from this model repo.
+
+ ```bash
+ python run_onnx_static.py \
+   --artifact_dir . \
+   --image ./examples/source/page.png \
+   --task document \
+   --device cuda \
+   --cuda_no_fallback \
+   --official_quality \
+   --vision_policy table_quant \
+   --out_text ./pred.md
+ ```
+
+ `--vision_policy table_quant` keeps conservative quality defaults for document and text tasks while using the quantized vision graph where appropriate for tables.
+
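+ For reference, the session setup behind `--device cuda --cuda_no_fallback` presumably looks something like the following sketch; the flag's exact semantics live in `run_onnx_static.py`, so this is an assumption, not the runner's code:
+
+ ```python
+ import onnxruntime as ort
+
+ # Fail fast rather than silently falling back to CPU.
+ if "CUDAExecutionProvider" not in ort.get_available_providers():
+     raise RuntimeError("CUDA execution provider is not available")
+
+ session = ort.InferenceSession(
+     "fp16/glm_ocr_vision.onnx",  # the external .onnx.data shard is loaded automatically
+     providers=["CUDAExecutionProvider"],
+ )
+ print([inp.name for inp in session.get_inputs()])
+ ```
+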
+ ## Browser / ORT Web (WASM/WebGPU)
+
+ Use `manifest.web.json` for session graph wiring.
+
+ - For constrained clients, prefer the hybrid/server-assisted profiles in the manifest.
+ - Loading all graphs in the browser may exceed practical memory on many devices.
+
+ Minimal JS loading sketch:
+
+ ```ts
+ import * as ort from "onnxruntime-web";
+
+ const manifest = await fetch("manifest.web.json").then((r) => r.json());
+ const visionPath = manifest.graphs.vision; // or manifest.graphs.vision_quant
+ const session = await ort.InferenceSession.create(visionPath, {
+   executionProviders: ["webgpu"], // fall back to "wasm" when needed
+ });
+ ```
+
+ ## Hugging Face Model Repo Upload Tips
+
+ - Track `.onnx` and `.data` files with Git LFS (see `.gitattributes`; an upload sketch follows this list).
+ - Upload all of `fp16/`, `quant/`, and both manifest files together.
+ - If a web app will fetch directly from this model repo, make sure cross-origin (CORS) requests from the app's origin are permitted.
+
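+ A minimal upload sketch with `huggingface_hub`, assuming the target repo already exists; the repo id is a placeholder:
+
+ ```python
+ from huggingface_hub import HfApi
+
+ api = HfApi()
+ # .gitattributes patterns route *.onnx and *.data through LFS.
+ api.upload_folder(
+     folder_path=".",
+     repo_id="your-username/glm-ocr-onnx",  # placeholder repo id
+     repo_type="model",
+     allow_patterns=["fp16/**", "quant/**", "manifest*.json", "*.md", ".gitattributes"],
+ )
+ ```
+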
+ ## License
+
+ This deployment artifact follows the upstream GLM-OCR license metadata (`MIT` at the time of packaging).
+ Always verify the upstream license/terms at: https://huggingface.co/zai-org/GLM-OCR
chat_template.jinja ADDED
@@ -0,0 +1,140 @@
+ [gMASK]<sop>
+ {%- if tools -%}
+ <|system|>
+ # Tools
+
+ You may call one or more functions to assist with the user query.
+
+ You are provided with function signatures within <tools></tools> XML tags:
+ <tools>
+ {% for tool in tools %}
+ {{ tool | tojson(ensure_ascii=False) }}
+ {% endfor %}
+ </tools>
+
+ For each function call, output the function name and arguments within the following XML format:
+ <tool_call>{function-name}
+ <arg_key>{arg-key-1}</arg_key>
+ <arg_value>{arg-value-1}</arg_value>
+ <arg_key>{arg-key-2}</arg_key>
+ <arg_value>{arg-value-2}</arg_value>
+ ...
+ </tool_call>{%- endif -%}
+ {%- macro visible_text(content) -%}
+ {%- if content is string -%}
+ {{- content }}
+ {%- elif content is iterable and content is not mapping -%}
+ {%- for item in content -%}
+ {%- if item is mapping and item.type == 'text' -%}
+ {{- item.text }}
+ {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}
+ <|begin_of_image|><|image|><|end_of_image|>
+ {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}
+ <|begin_of_video|><|video|><|end_of_video|>
+ {%- elif item is string -%}
+ {{- item }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- else -%}
+ {{- content }}
+ {%- endif -%}
+ {%- endmacro -%}
+ {%- set ns = namespace(last_user_index=-1) %}
+ {%- for m in messages %}
+ {%- if m.role == 'user' %}
+ {% set ns.last_user_index = loop.index0 -%}
+ {%- endif %}
+ {%- endfor %}
+ {% for m in messages %}
+ {%- if m.role == 'user' -%}<|user|>
+ {% if m.content is string %}
+ {{ m.content }}
+ {%- else %}
+ {%- for item in m.content %}
+ {% if item.type == 'video' or 'video' in item %}
+ <|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}
+ <|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}
+ {{ item.text }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
+ {%- elif m.role == 'assistant' -%}
+ <|assistant|>
+ {%- set reasoning_content = '' %}
+ {%- set content = visible_text(m.content) %}
+ {%- if m.reasoning_content is string %}
+ {%- set reasoning_content = m.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in content %}
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_user_index and reasoning_content -%}
+ {{ '\n<think>' + reasoning_content.strip() + '</think>'}}
+ {%- else -%}
+ {{ '\n<think></think>' }}
+ {%- endif -%}
+ {%- if content.strip() -%}
+ {{ '\n' + content.strip() }}
+ {%- endif -%}
+ {% if m.tool_calls %}
+ {% for tc in m.tool_calls %}
+ {%- if tc.function %}
+ {%- set tc = tc.function %}
+ {%- endif %}
+ {{ '\n<tool_call>' + tc.name }}
+ {% set _args = tc.arguments %}
+ {% for k, v in _args.items() %}
+ <arg_key>{{ k }}</arg_key>
+ <arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
+ {% endfor %}
+ </tool_call>{% endfor %}
+ {% endif %}
+ {%- elif m.role == 'tool' -%}
+ {%- if m.content is string -%}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|observation|>' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- m.content }}
+ {{- '\n</tool_response>' }}
+ {% elif m.content is iterable and m.content is not mapping %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|observation|>' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {%- for tr in m.content -%}
+ {%- if tr is mapping and tr.type is defined -%}
+ {%- set t = tr.type | lower -%}
+ {%- if t == 'text' and tr.text is defined -%}
+ {{ tr.text }}
+ {%- elif t in ['image', 'image_url'] -%}
+ <|begin_of_image|><|image|><|end_of_image|>
+ {%- elif t in ['video', 'video_url'] -%}
+ <|begin_of_video|><|video|><|end_of_video|>
+ {%- else -%}
+ {{ tr | tojson(ensure_ascii=False) }}
+ {%- endif -%}
+ {%- else -%}
+ {{ tr.output if tr.output is defined else tr }}
+ {%- endif -%}
+ {%- endfor -%}
+ {{- '\n</tool_response>' }}
+ {%- else -%}
+ <|observation|>{% for tr in m.content %}
+
+ <tool_response>
+ {{ tr.output if tr.output is defined else tr }}
+ </tool_response>{% endfor -%}
+ {% endif -%}
+ {%- elif m.role == 'system' -%}
+ <|system|>
+ {{ visible_text(m.content) }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if add_generation_prompt -%}
+ <|assistant|>
+ {{'<think></think>\n' if (enable_thinking is defined and not enable_thinking) else ''}}
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,65 @@
+ {
+   "architectures": [
+     "GlmOcrForConditionalGeneration"
+   ],
+   "model_type": "glm_ocr",
+   "text_config": {
+     "model_type": "glm_ocr_text",
+     "pad_token_id": 59246,
+     "vocab_size": 59392,
+     "eos_token_id": [
+       59246,
+       59253
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 1536,
+     "initializer_range": 0.02,
+     "intermediate_size": 4608,
+     "max_position_embeddings": 131072,
+     "num_attention_heads": 16,
+     "num_hidden_layers": 16,
+     "num_nextn_predict_layers": 1,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-05,
+     "dtype": "bfloat16",
+     "rope_parameters": {
+       "rope_type": "default",
+       "mrope_section": [
+         16,
+         24,
+         24
+       ],
+       "partial_rotary_factor": 1.0,
+       "rope_theta": 10000
+     },
+     "tie_word_embeddings": false,
+     "use_cache": true
+   },
+   "vision_config": {
+     "model_type": "glm_ocr_vision",
+     "hidden_size": 1024,
+     "depth": 24,
+     "num_heads": 16,
+     "attention_bias": true,
+     "intermediate_size": 4096,
+     "hidden_act": "silu",
+     "hidden_dropout_prob": 0.0,
+     "initializer_range": 0.02,
+     "image_size": 336,
+     "patch_size": 14,
+     "out_hidden_size": 1536,
+     "rms_norm_eps": 1e-05,
+     "spatial_merge_size": 2,
+     "temporal_patch_size": 2
+   },
+   "image_start_token_id": 59256,
+   "image_end_token_id": 59257,
+   "video_start_token_id": 59258,
+   "video_end_token_id": 59259,
+   "image_token_id": 59280,
+   "video_token_id": 59281,
+   "transformers_version": "5.0.1dev0"
+ }
fp16/glm_ocr_decode_prefill_kv.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e469aac93dff5835c621d1e7a8987aa802b112d8092b780d4742a317534cd627
+ size 2551204
fp16/glm_ocr_decode_prefill_kv.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bd63f01dac7b06ee4920b7c90d38cfc081988f1eb93b04eaa684c6e923e5f360
+ size 846466048
fp16/glm_ocr_decode_step_kv.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cea69c66eee2ca5e2e1e62185fd50bae0b431735027d107d070a4d8e02f4a318
+ size 4162536
fp16/glm_ocr_decode_step_kv.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b3a254982459a269a566bdcaa574f9edb4c77fb470c0745e32a36e6ecc1ca927
+ size 2328349696
fp16/glm_ocr_embed.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:700497f7747ce77b2b34f519b662d2bee68f44e638de9391301a74011fd2bf20
+ size 1791
fp16/glm_ocr_embed.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44c2c5e0a2a8d65605f06897249a03fb1d11051aee155ba598e1d1a302ababd0
+ size 364904448
fp16/glm_ocr_rope_document.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57bbac5220e5828a5ea0bf901974bc44afa190e3e7d9927bceca3ca8a63c4dc3
+ size 4170
fp16/glm_ocr_rope_document.onnx.data ADDED
Binary file (98.3 kB).
 
fp16/glm_ocr_rope_formula.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:de0c135e8dd941ec1a0246384d20c20984eb2bbed8fd08b43830ba40c081f232
+ size 4169
fp16/glm_ocr_rope_formula.onnx.data ADDED
Binary file (98.3 kB).
 
fp16/glm_ocr_rope_table.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88faf6507c9baa52f4069b4a6469c5ab692038a538ef3d9df52ad2408648ba22
+ size 4167
fp16/glm_ocr_rope_table.onnx.data ADDED
Binary file (98.3 kB).
 
fp16/glm_ocr_rope_text.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b68daba410b29a635de18b135212569db1c28add28fbe7c5b63dcd0c7b3d0b63
+ size 4166
fp16/glm_ocr_rope_text.onnx.data ADDED
Binary file (98.3 kB).
 
fp16/glm_ocr_vision.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efa009d7c358e4ebf83515b80a96fd828a05e40bd86be190dfabd9fc0f29bc73
+ size 715251
fp16/glm_ocr_vision.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8691969b71702daabc378e031b66c62d13d054a7cfed17e2729ea707079d2b45
+ size 14571213824
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_from_model_config": true,
+   "do_sample": false,
+   "eos_token_id": [
+     59246,
+     59253
+   ],
+   "pad_token_id": 59246,
+   "transformers_version": "5.0.1dev0"
+ }
manifest.json ADDED
@@ -0,0 +1,145 @@
+ {
+   "model_id": "zai-org/GLM-OCR",
+   "dtype": "float16",
+   "opset": 18,
+   "static": true,
+   "external_data": true,
+   "image_size": {
+     "width": 840,
+     "height": 840
+   },
+   "max_seq_len": 2048,
+   "image_token_id": 59280,
+   "eos_token_ids": [
+     59246,
+     59253
+   ],
+   "export_prompt": "Recognize the text in the image and output in Markdown format. Preserve the original layout (headings/paragraphs/tables/formulas). Do not fabricate content that does not exist in the image.",
+   "hidden_size": 1536,
+   "t_img": 900,
+   "export_devices": {
+     "vision": "cuda",
+     "text": "cuda"
+   },
+   "graphs": {
+     "vision": "fp16/glm_ocr_vision.onnx",
+     "embed": "fp16/glm_ocr_embed.onnx",
+     "rope": "fp16/glm_ocr_rope_document.onnx",
+     "decode_prefill_kv": "fp16/glm_ocr_decode_prefill_kv.onnx",
+     "decode_step_kv": "fp16/glm_ocr_decode_step_kv.onnx",
+     "vision_quant": "quant/glm_ocr_vision_quant.onnx"
+   },
+   "kv_cache": {
+     "num_layers": 16,
+     "num_key_value_heads": 8,
+     "head_dim": 96,
+     "max_cache_len": 2048
+   },
+   "default_profile": "document",
+   "prompt_profiles": {
+     "document": {
+       "prompt": "Recognize the text in the image and output in Markdown format. Preserve the original layout (headings/paragraphs/tables/formulas). Do not fabricate content that does not exist in the image.",
+       "rope": "fp16/glm_ocr_rope_document.onnx"
+     },
+     "text": {
+       "prompt": "Text Recognition:",
+       "rope": "fp16/glm_ocr_rope_text.onnx"
+     },
+     "table": {
+       "prompt": "Table Recognition:",
+       "rope": "fp16/glm_ocr_rope_table.onnx"
+     },
+     "formula": {
+       "prompt": "Formula Recognition:",
+       "rope": "fp16/glm_ocr_rope_formula.onnx"
+     }
+   },
+   "notes": [
+     "Vision wrapper handles packed [T,D] outputs by unsqueezing to [1,T,D].",
+     "Rope graphs are prompt-profile specific constants generated from get_rope_index (mRoPE).",
+     "Do splice in JS: replace contiguous image_token_id block of length t_img with image_embeds.",
+     "Decode outputs logits for last token only.",
+     "custom_w8: quantized MatMul/Gemm weights for graph 'decode_prefill_kv'.",
+     "dual-vision artifact: graphs.vision=fp16 and graphs.vision_quant=quantized."
+   ],
+   "vision": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_vision.onnx",
+     "sha256": "efa009d7c358e4ebf83515b80a96fd828a05e40bd86be190dfabd9fc0f29bc73",
+     "bytes": 715251,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_vision.onnx.data",
+     "data_sha256": "8691969b71702daabc378e031b66c62d13d054a7cfed17e2729ea707079d2b45",
+     "data_bytes": 14571213824
+   },
+   "embed": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_embed.onnx",
+     "sha256": "700497f7747ce77b2b34f519b662d2bee68f44e638de9391301a74011fd2bf20",
+     "bytes": 1791,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_embed.onnx.data",
+     "data_sha256": "44c2c5e0a2a8d65605f06897249a03fb1d11051aee155ba598e1d1a302ababd0",
+     "data_bytes": 364904448
+   },
+   "rope": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_document.onnx",
+     "sha256": "57bbac5220e5828a5ea0bf901974bc44afa190e3e7d9927bceca3ca8a63c4dc3",
+     "bytes": 4170,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_document.onnx.data",
+     "data_sha256": "cb38103e3aceeb1adeb7104611bd035656dd8433b3c03eb96ac2b5e5df9b55b4",
+     "data_bytes": 98304
+   },
+   "decode": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode.onnx",
+     "sha256": "6d536a5b1e671e2229bb88a050c747ad6fdc9db3292927af4552c33928a46001",
+     "bytes": 2429565,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode.onnx.data",
+     "data_sha256": "6847fe2bfe59df72975856dc61730b2ecb8a0f79455a898375fc4f7c2c7cb2da",
+     "data_bytes": 2328431616
+   },
+   "decode_prefill_kv": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode_prefill_kv.onnx",
+     "sha256": "27974f2680f37205f1a51244dbe7fc65e0eefbcf926a3fc9e7a34a5095d4a755",
+     "bytes": 2547577,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode_prefill_kv.onnx.data",
+     "data_sha256": "6847fe2bfe59df72975856dc61730b2ecb8a0f79455a898375fc4f7c2c7cb2da",
+     "data_bytes": 2328431616
+   },
+   "decode_step_kv": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode_step_kv.onnx",
+     "sha256": "cea69c66eee2ca5e2e1e62185fd50bae0b431735027d107d070a4d8e02f4a318",
+     "bytes": 4162536,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_decode_step_kv.onnx.data",
+     "data_sha256": "b3a254982459a269a566bdcaa574f9edb4c77fb470c0745e32a36e6ecc1ca927",
+     "data_bytes": 2328349696
+   },
+   "rope_document": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_document.onnx",
+     "sha256": "57bbac5220e5828a5ea0bf901974bc44afa190e3e7d9927bceca3ca8a63c4dc3",
+     "bytes": 4170,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_document.onnx.data",
+     "data_sha256": "cb38103e3aceeb1adeb7104611bd035656dd8433b3c03eb96ac2b5e5df9b55b4",
+     "data_bytes": 98304
+   },
+   "rope_text": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_text.onnx",
+     "sha256": "b68daba410b29a635de18b135212569db1c28add28fbe7c5b63dcd0c7b3d0b63",
+     "bytes": 4166,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_text.onnx.data",
+     "data_sha256": "4147cb50328e4bb79a289496b2d0dea536c02898fc62591fad72da0b636a33ee",
+     "data_bytes": 98304
+   },
+   "rope_table": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_table.onnx",
+     "sha256": "88faf6507c9baa52f4069b4a6469c5ab692038a538ef3d9df52ad2408648ba22",
+     "bytes": 4167,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_table.onnx.data",
+     "data_sha256": "4147cb50328e4bb79a289496b2d0dea536c02898fc62591fad72da0b636a33ee",
+     "data_bytes": 98304
+   },
+   "rope_formula": {
+     "onnx": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_formula.onnx",
+     "sha256": "de0c135e8dd941ec1a0246384d20c20984eb2bbed8fd08b43830ba40c081f232",
+     "bytes": 4169,
+     "data": "artifact_glm_ocr_web_split/fp16/glm_ocr_rope_formula.onnx.data",
+     "data_sha256": "2929c9761dd6510fa6734a2ded0bd07d7b8f2705072a0542e76b7ccda9c0f713",
+     "data_bytes": 98304
+   }
+ }
manifest.web.json ADDED
@@ -0,0 +1,113 @@
+ {
+   "model_id": "zai-org/GLM-OCR",
+   "dtype": "float16",
+   "opset": 18,
+   "max_seq_len": 2048,
+   "image_size": {
+     "width": 840,
+     "height": 840
+   },
+   "image_token_id": 59280,
+   "eos_token_ids": [
+     59246,
+     59253
+   ],
+   "kv_cache": {
+     "num_layers": 16,
+     "num_key_value_heads": 8,
+     "head_dim": 96,
+     "max_cache_len": 2048
+   },
+   "prompt_profile": "document",
+   "base_url": "./",
+   "graphs": {
+     "vision": "fp16/glm_ocr_vision.onnx",
+     "embed": "fp16/glm_ocr_embed.onnx",
+     "rope": "fp16/glm_ocr_rope_document.onnx",
+     "decode_prefill_kv": "fp16/glm_ocr_decode_prefill_kv.onnx",
+     "decode_step_kv": "fp16/glm_ocr_decode_step_kv.onnx",
+     "vision_quant": "quant/glm_ocr_vision_quant.onnx"
+   },
+   "runtime": {
+     "webgpu": {
+       "executionProviders": [
+         "webgpu"
+       ],
+       "logSeverityLevel": 3
+     },
+     "wasm": {
+       "executionProviders": [
+         "wasm"
+       ],
+       "logSeverityLevel": 3
+     }
+   },
+   "profiles": {
+     "full_browser_kv": {
+       "description": "All sessions in browser, KV prefill+step decoding.",
+       "graphs": [
+         "vision",
+         "embed",
+         "rope",
+         "decode_prefill_kv",
+         "decode_step_kv"
+       ],
+       "requires_image_embeds_input": false,
+       "estimated_weight_gb": 18.254
+     },
+     "hybrid_server_vision_client_kv": {
+       "description": "Vision/image embedding runs on server; browser runs text decode with KV.",
+       "graphs": [
+         "embed",
+         "rope",
+         "decode_prefill_kv",
+         "decode_step_kv"
+       ],
+       "requires_image_embeds_input": true,
+       "estimated_weight_gb": 4.683
+     },
+     "hybrid_server_vision_client_prefill_only": {
+       "description": "Vision on server; browser uses decode_prefill_kv only (simpler, slower token loop).",
+       "graphs": [
+         "embed",
+         "rope",
+         "decode_prefill_kv"
+       ],
+       "requires_image_embeds_input": true,
+       "estimated_weight_gb": 2.511
+     }
+   },
+   "load_order": [
+     "vision",
+     "vision_quant",
+     "embed",
+     "rope",
+     "decode_prefill_kv",
+     "decode_step_kv"
+   ],
+   "memory_budget_gb": {
+     "full_browser_kv": 18.254,
+     "full_browser_prefill_only": 16.082,
+     "hybrid_server_vision_client_kv": 4.683,
+     "hybrid_server_vision_client_prefill_only": 2.511
+   },
+   "per_graph_size_gb": {
+     "vision": 13.571,
+     "embed": 0.34,
+     "rope_profile": 0.0,
+     "decode_prefill_kv": 2.171,
+     "decode_step_kv": 2.172,
+     "vision_quant": 0.0
+   },
+   "warnings": [
+     "vision is 13.57 GB; likely too large for direct browser startup on most clients.",
+     "decode_prefill_kv is 2.17 GB; likely too large for direct browser startup on most clients.",
+     "decode_step_kv is 2.17 GB; likely too large for direct browser startup on most clients."
+   ],
+   "notes": [
+     "This manifest is for ORT Web (WebGPU/WASM) session wiring.",
+     "Prefer lazy session creation; do not load unused graphs.",
+     "Use rope graph matching the exported prompt profile.",
+     "For browser deployments, hybrid mode is usually required at current model size."
+   ]
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "size": {"shortest_edge": 12544, "longest_edge": 9633792},
+   "do_rescale": true,
+   "patch_size": 14,
+   "temporal_patch_size": 2,
+   "merge_size": 2,
+   "image_mean": [0.48145466, 0.4578275, 0.40821073],
+   "image_std": [0.26862954, 0.26130258, 0.27577711],
+   "image_processor_type": "Glm46VImageProcessor",
+   "processor_class": "Glm46VProcessor"
+ }
quant/glm_ocr_vision_quant.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3daf41d962e7878703682f8aed901bc697d1e9652e148c21decef39691750702
+ size 767365
quant/glm_ocr_vision_quant.onnx.data ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fdfd0d65846923674dc0ae0c18c30d09dfb4752125fb93c45b68ae6c4fb7b264
+ size 490746880
tokenizer.json ADDED
The diff for this file is too large to render.
 
tokenizer_config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "backend": "tokenizers",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "extra_special_tokens": [
+     "<|endoftext|>",
+     "[MASK]",
+     "[gMASK]",
+     "[sMASK]",
+     "<sop>",
+     "<eop>",
+     "<|system|>",
+     "<|user|>",
+     "<|assistant|>",
+     "<|observation|>",
+     "<|begin_of_image|>",
+     "<|end_of_image|>",
+     "<|begin_of_video|>",
+     "<|end_of_video|>",
+     "<|begin_of_audio|>",
+     "<|end_of_audio|>",
+     "<|begin_of_transcription|>",
+     "<|end_of_transcription|>",
+     "<|code_prefix|>",
+     "<|code_middle|>",
+     "<|code_suffix|>",
+     "<think>",
+     "</think>",
+     "<tool_call>",
+     "</tool_call>",
+     "<tool_response>",
+     "</tool_response>",
+     "<arg_key>",
+     "</arg_key>",
+     "<arg_value>",
+     "</arg_value>",
+     "/nothink",
+     "<|begin_of_box|>",
+     "<|end_of_box|>",
+     "<|image|>",
+     "<|video|>"
+   ],
+   "is_local": true,
+   "model_max_length": 655380,
+   "pad_token": "<|endoftext|>",
+   "padding_side": "left",
+   "processor_class": "Glm46VProcessor",
+   "tokenizer_class": "TokenizersBackend"
+ }