alvinfn yasserDahou committed on
Commit
ed18782
·
0 Parent(s):

Duplicate from tiiuae/Falcon-OCR


Co-authored-by: Yasser Dahou <yasserDahou@users.noreply.huggingface.co>

.eval_results/olmocrbench.yaml ADDED
@@ -0,0 +1,89 @@
1
+ - dataset:
2
+ id: allenai/olmOCR-bench
3
+ task_id: overall
4
+ value: 80.3
5
+ source:
6
+ url: https://huggingface.co/tiiuae/Falcon-OCR
7
+ name: Falcon-OCR Model Card
8
+ user: nielsr
9
+ notes: English-subset only
10
+
11
+ - dataset:
12
+ id: allenai/olmOCR-bench
13
+ task_id: arxiv_math
14
+ value: 80.5
15
+ source:
16
+ url: https://huggingface.co/tiiuae/Falcon-OCR
17
+ name: Falcon-OCR Model Card
18
+ user: nielsr
19
+ notes: English-subset only
20
+
21
+ - dataset:
22
+ id: allenai/olmOCR-bench
23
+ task_id: old_scans_math
24
+ value: 69.2
25
+ source:
26
+ url: https://huggingface.co/tiiuae/Falcon-OCR
27
+ name: Falcon-OCR Model Card
28
+ user: nielsr
29
+ notes: English-subset only
30
+
31
+ - dataset:
32
+ id: allenai/olmOCR-bench
33
+ task_id: table_tests
34
+ value: 90.3
35
+ source:
36
+ url: https://huggingface.co/tiiuae/Falcon-OCR
37
+ name: Falcon-OCR Model Card
38
+ user: nielsr
39
+ notes: English-subset only
40
+
41
+ - dataset:
42
+ id: allenai/olmOCR-bench
43
+ task_id: old_scans
44
+ value: 43.5
45
+ source:
46
+ url: https://huggingface.co/tiiuae/Falcon-OCR
47
+ name: Falcon-OCR Model Card
48
+ user: nielsr
49
+ notes: English-subset only
50
+
51
+ - dataset:
52
+ id: allenai/olmOCR-bench
53
+ task_id: headers_footers
54
+ value: 94.0
55
+ source:
56
+ url: https://huggingface.co/tiiuae/Falcon-OCR
57
+ name: Falcon-OCR Model Card
58
+ user: nielsr
59
+ notes: English-subset only
60
+
61
+ - dataset:
62
+ id: allenai/olmOCR-bench
63
+ task_id: multi_column
64
+ value: 87.1
65
+ source:
66
+ url: https://huggingface.co/tiiuae/Falcon-OCR
67
+ name: Falcon-OCR Model Card
68
+ user: nielsr
69
+ notes: English-subset only
70
+
71
+ - dataset:
72
+ id: allenai/olmOCR-bench
73
+ task_id: long_tiny_text
74
+ value: 78.5
75
+ source:
76
+ url: https://huggingface.co/tiiuae/Falcon-OCR
77
+ name: Falcon-OCR Model Card
78
+ user: nielsr
79
+ notes: English-subset only
80
+
81
+ - dataset:
82
+ id: allenai/olmOCR-bench
83
+ task_id: baseline
84
+ value: 99.5
85
+ source:
86
+ url: https://huggingface.co/tiiuae/Falcon-OCR
87
+ name: Falcon-OCR Model Card
88
+ user: nielsr
89
+ notes: English-subset only
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,456 @@
1
+ ---
2
+ pipeline_tag: image-to-text
3
+ library_name: transformers
4
+ tags:
5
+ - falcon
6
+ - ocr
7
+ - vision-language
8
+ - document-understanding
9
+ ---
10
+
11
+
12
+ <div style="width: 480px; text-align: left;">
13
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/663c9939c1b4f7297c4ae6f6/YIuxzgDiV5T2ZuSB4bam9.png" alt="Falcon OCR Logo" style="max-width: 100%; height: auto;">
14
+ </div>
15
+
16
+ # Falcon OCR
17
+
18
+ Falcon OCR is a 300M-parameter early-fusion vision-language model for document OCR. Given an image, it can produce plain text, LaTeX for formulas, or HTML for tables, depending on the requested output format.
19
+
20
+ Most OCR VLM systems are built as a pipeline with a vision encoder feeding a separate text decoder, plus additional task-specific glue. Falcon OCR takes a different approach: a single Transformer processes image patches and text tokens in a shared parameter space from the first layer, using a hybrid attention mask where image tokens attend bidirectionally and text tokens decode causally conditioned on the image.
21
+
22
+ We built it this way for two practical reasons. First, it keeps the interface simple: one backbone, one decoding path, and task switching through prompts rather than a growing set of modules. Second, a 0.3B model has a lower latency and cost footprint than 0.9B-class OCR VLMs, and in our vLLM-based serving setup this translates into higher throughput, often 2–3× faster depending on sequence lengths and batch configuration. To our knowledge, this is one of the first attempts to apply this early-fusion single-stack recipe directly to competitive document OCR at this scale.
23
+
24
+ ### Links
25
+
26
+ - Code and inference engine: [https://github.com/tiiuae/Falcon-Perception](https://github.com/tiiuae/Falcon-Perception)
27
+ - Tech report: arXiv link coming soon
28
+ - Perception model: `tiiuae/falcon-perception`
29
+ - vLLM/Docker: [https://ghcr.io/tiiuae/falcon-ocr:latest](https://ghcr.io/tiiuae/falcon-ocr:latest)
30
+
31
+ ## Quickstart
32
+
33
+ ### Installation
34
+
35
+ ```bash
36
+ pip install "torch>=2.5" transformers pillow einops
37
+ ```
38
+
39
+ Falcon OCR requires PyTorch 2.5 or newer for FlexAttention. The first call may be slower as `torch.compile` builds optimized kernels.
40
+
41
+ ### Single-Image OCR
42
+
43
+ ```python
44
+ import torch
45
+ from PIL import Image
46
+ from transformers import AutoModelForCausalLM
47
+ model = AutoModelForCausalLM.from_pretrained(
+     "tiiuae/Falcon-OCR",
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
53
+ image = Image.open("document.png")
54
+ texts = model.generate(image) # default category is "plain"
55
+ print(texts[0])
56
+ ```
57
+
58
+ ### Choose an output format with `category`
59
+
60
+ ```python
61
+ texts = model.generate(image, category="text") # plain text
62
+ texts = model.generate(image, category="formula") # LaTeX
63
+ texts = model.generate(image, category="table") # HTML table
64
+ ```
65
+
66
+ ## API
67
+
68
+ ### `model.generate(images, category="plain", **kwargs)`
69
+ - **Inputs**:
70
+ - `images`: a `PIL.Image.Image` or a list of images
71
+ - `category`: one of `plain`, `text`, `table`, `formula`, `caption`, `footnote`, `list-item`, `page-footer`, `page-header`, `section-header`, `title`
72
+ - **Returns**: `list[str]`, one extracted string per image
73
+
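+ Because `images` accepts a list, a whole batch can be processed in one call. A minimal sketch (file names are placeholders):
+
+ ```python
+ from PIL import Image
+
+ paths = ["page1.png", "page2.png"]
+ pages = [Image.open(p) for p in paths]
+ texts = model.generate(pages, category="text")  # list[str], one result per image, in input order
+ for path, text in zip(paths, texts):
+     print(path, "->", text[:80])
+ ```
+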
74
+ ## Layout OCR (Two-Stage Pipeline)
75
+ For sparse documents, running OCR on the whole image can work well. For dense documents with heterogeneous regions (multi-column layouts, interleaved tables and formulas, small captions), we provide an optional two-stage pipeline:
76
+ 1. A layout detector finds regions on the page.
77
+ 2. Falcon OCR runs independently on each crop with a category-specific prompt.
78
+ We use [PP-DocLayoutV3](https://huggingface.co/PaddlePaddle/PP-DocLayoutV3_safetensors) as the layout detector.
79
+ ```python
80
+ results = model.generate_with_layout(image)
81
+ for det in results[0]:
82
+     print(f"[{det['category']}] {det['text'][:100]}...")
83
+ ```
84
+ Batch mode:
85
+ ```python
86
+ results = model.generate_with_layout(
87
+     [Image.open("page1.png"), Image.open("page2.png")],
+     ocr_batch_size=32,
89
+ )
90
+ ```
91
+ The layout model is loaded lazily on the first `generate_with_layout()` call and runs on the same GPU as the OCR model.
92
+ **Returns**: `list[list[dict]]`, one list per image, in reading order:
93
+ ```python
94
+ {
95
+ "category": "text", # layout category
96
+ "bbox": [x1, y1, x2, y2], # in original image pixels
97
+ "score": 0.93, # detection confidence
98
+ "text": "..." # extracted text
99
+ }
100
+ ```
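+
+ For downstream use, the per-region results can be stitched back into a single string in reading order. A minimal sketch built on the schema above (the joining rule is illustrative, not part of the API):
+
+ ```python
+ def detections_to_text(detections: list[dict]) -> str:
+     """Concatenate extracted regions in reading order, skipping empty ones (illustrative helper)."""
+     parts = [det["text"].strip() for det in detections if det["text"].strip()]
+     return "\n\n".join(parts)
+
+ page_text = detections_to_text(results[0])  # results from generate_with_layout(...)
+ ```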
101
+
102
+ ## When to Use What
103
+
104
+ | Mode | Best for | How |
105
+ |------|----------|-----|
106
+ | **Plain OCR** | Simple documents, real-world photos, slides, receipts, invoices | `model.generate(image)` |
107
+ | **Layout + OCR** | Complex multi-column documents, academic papers, reports, dense pages like newspapers | `model.generate_with_layout(image)` |
108
+
109
+ ## Benchmark Results
110
+ <details name="benchmarks" open>
111
+ <summary><b>olmOCR Benchmark</b></summary>
112
+
113
+ Category-wise performance comparison of FalconOCR against state-of-the-art OCR models. We report accuracy (%) across all category splits.
114
+
115
+ <table>
116
+ <tr><th>Model</th><th>Average</th><th>ArXiv Math</th><th>Base</th><th>Hdr/Ftr</th><th>TinyTxt</th><th>MultCol</th><th>OldScan</th><th>OldMath</th><th>Tables</th></tr>
117
+ <tr><td>Mistral OCR 3</td><td>81.7</td><td><b>85.4</b></td><td><b>99.9</b></td><td>93.8</td><td>88.9</td><td>82.1</td><td>48.8</td><td>68.3</td><td>86.1</td></tr>
118
+ <tr><td>Chandra</td><td><b>82.0</b></td><td>81.4</td><td>99.8</td><td>88.8</td><td><b>91.9</b></td><td>82.9</td><td><b>49.2</b></td><td>73.6</td><td>88.2</td></tr>
119
+ <tr><td>Gemini 3 Pro</td><td>80.2</td><td>70.6</td><td>99.8</td><td>84.0</td><td>90.3</td><td>79.2</td><td>47.5</td><td>84.9</td><td>84.9</td></tr>
120
+ <tr><td>PaddleOCR VL 1.5</td><td>79.3</td><td><b>85.4</b></td><td>98.8</td><td><b>96.9</b></td><td>80.8</td><td>82.6</td><td>39.2</td><td>66.4</td><td>84.1</td></tr>
121
+ <tr><td>PaddleOCR VL</td><td>79.2</td><td><b>85.4</b></td><td>98.6</td><td><b>96.9</b></td><td>80.8</td><td>82.5</td><td>38.8</td><td>66.4</td><td>83.9</td></tr>
122
+ <tr><td>DeepSeek OCR v2</td><td>78.8</td><td>81.9</td><td>99.8</td><td>95.6</td><td>88.7</td><td>83.6</td><td>33.7</td><td>68.8</td><td>78.1</td></tr>
123
+ <tr><td>Gemini 3 Flash</td><td>77.5</td><td>66.5</td><td>99.8</td><td>83.8</td><td>88.2</td><td>73.7</td><td>46.0</td><td><b>85.8</b></td><td>75.9</td></tr>
124
+ <tr><td>GPT 5.2</td><td>69.8</td><td>61.0</td><td>99.8</td><td>75.6</td><td>62.2</td><td>70.2</td><td>34.6</td><td>75.8</td><td>79.0</td></tr>
125
+ <tr style="background:#a358e5; color:white"><td><b>FalconOCR</b></td><td>80.3</td><td>80.5</td><td>99.5</td><td>94.0</td><td>78.5</td><td><b>87.1</b></td><td>43.5</td><td>69.2</td><td><b>90.3</b></td></tr>
126
+ </table>
127
+
128
+ </details>
129
+
130
+ <details name="benchmarks">
131
+ <summary><b>OmniDocBench</b></summary>
132
+
133
+ Performance comparison on full-page document parsing. Overall↑ aggregates the three sub-metrics. Edit↓ measures text edit distance (lower is better). CDM↑ evaluates formula recognition accuracy. TEDS↑ measures table structure similarity.
134
+
135
+ <table>
136
+ <tr><th>Model</th><th>Overall↑</th><th>Edit↓</th><th>CDM↑</th><th>TEDS↑</th></tr>
137
+ <tr><td>PaddleOCR VL 1.5</td><td><b>94.37</b></td><td>0.025</td><td><b>94.4</b></td><td><b>91.1</b></td></tr>
138
+ <tr><td>PaddleOCR VL</td><td>91.76</td><td><b>0.024</b></td><td>91.7</td><td>85.9</td></tr>
139
+ <tr><td>Chandra</td><td>88.97</td><td>0.046</td><td>88.1</td><td>89.5</td></tr>
140
+ <tr><td>DeepSeek OCR v2</td><td>87.66</td><td>0.037</td><td>89.2</td><td>77.5</td></tr>
141
+ <tr><td>GPT 5.2</td><td>86.56</td><td>0.061</td><td>88.0</td><td>77.7</td></tr>
142
+ <tr><td>Mistral OCR 3</td><td>85.20</td><td>0.053</td><td>84.3</td><td>76.1</td></tr>
143
+ <tr style="background:#a358e5; color:white"><td><b>FalconOCR</b></td><td>88.64</td><td>0.055</td><td>86.8</td><td>84.6</td></tr>
144
+ </table>
145
+
146
+ </details>
147
+
148
+ ### Results Analysis
149
+
150
+ Two observations stand out. First, a compact model can be competitive when the interface is simple and the training signal is targeted: on olmOCR, Falcon OCR performs strongly on multi-column documents and tables, and is competitive overall against substantially larger systems. Second, evaluation on full-page parsing is sensitive to matching and representation details: on OmniDocBench, the table and formula metrics depend not only on recognition quality but also on how predicted elements are matched to ground truth and how output structure is normalized.
151
+
152
+ More broadly, these results suggest that an early-fusion single-stack Transformer can be a viable alternative to the common "vision encoder plus text decoder" recipe for OCR. We do not view this as a finished answer, but as a promising direction: one early-fusion backbone, a shared parameter space between text and images, a single decoding interface, and better data and training signals, rather than increasingly complex pipelines. To our knowledge, this is among the first demonstrations that this early-fusion recipe can reach competitive document OCR accuracy at this scale, and we hope it encourages further work in this direction.
153
+
154
+ ## Serving Throughput
155
+
156
+ Measured on a single A100-80GB GPU with vLLM, processing document images from olmOCR-Bench under high concurrency to keep vLLM fully utilized.
+
+ - **Layout + OCR** — the full end-to-end pipeline: layout detection finds regions on each page, crops them, and vLLM runs OCR on every crop. This reflects real-world serving throughput, inclusive of both layout-detection and OCR time.
+
+ | Mode | tok/s | img/s | Description |
+ |------|------:|------:|-------------|
+ | **Layout + OCR** | 5,825 | 2.9 | Full pipeline: layout detection → crop → per-region OCR |
167
+
168
+ At 0.3B parameters, Falcon OCR is roughly 3× smaller than 0.9B-class OCR VLMs (e.g., PaddleOCR VL), which translates directly into higher serving throughput at competitive accuracy.
169
+
170
+ ## Limitations
171
+
172
+ - **Old scans and tiny text**: Heavily degraded scans and very small glyphs remain challenging. These cases often require higher effective resolution and better coverage in the training mixture.
173
+ - **Non-unique table representations**: Visually identical tables can be encoded in structurally different HTML forms, which can affect tree-based metrics.
174
+ - **Formula matching sensitivity**: LaTeX and Unicode conventions can be penalized differently depending on the benchmark normalization and matching pipeline.
175
+
176
+ ## Examples
177
+
178
+ *Click each section below to expand.*
179
+
180
+ <details name="ocr-examples" open>
181
+ <summary><b>Handwriting and Real World Images</b></summary>
182
+ <p align="center">
183
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/62fe441427c98b09b503a4e3/51Fj1wxxtAV_jwubml6sa.png" width="600" alt="Handwriting and real world OCR examples" />
184
+ </p>
185
+ </details>
186
+
187
+ <details name="ocr-examples">
188
+ <summary><b>Tables</b></summary>
189
+ <p align="center">
190
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/62fe441427c98b09b503a4e3/2yZjZJAEHVVpd_jfyyDcQ.png" width="600" alt="Table OCR examples" />
191
+ </p>
192
+ </details>
193
+
194
+ <details name="ocr-examples">
195
+ <summary><b>Formulas</b></summary>
196
+ <p align="center">
197
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/62fe441427c98b09b503a4e3/__XMb0GyGO02IPKlQsPQx.png" width="600" alt="Formula OCR examples" />
198
+ </p>
199
+ </details>
200
+
201
+ <details name="ocr-examples">
202
+ <summary><b>Complex Layout</b></summary>
203
+ <p align="center">
204
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/62fe441427c98b09b503a4e3/kTR7nI7ogEqdI1SQtXpTu.png" width="600" alt="Complex layout OCR examples" />
205
+ </p>
206
+ </details>
207
+
208
+
209
+ ---
210
+
211
+ ## vLLM Server
212
+ We also provide a Docker-based vLLM-backed inference server capable of serving approximately 6,000 tokens per second.
213
+
214
+ Single Docker image with two services:
215
+
216
+ | Service | Default Port | Description |
217
+ |---------|-------------|-------------|
218
+ | **vLLM** | 8000 | Falcon-OCR vision-language model (OpenAI-compatible API) |
219
+ | **Pipeline** | 5002 | Full document parsing: layout detection → crop → OCR → markdown |
220
+
221
+ The layout model runs inside the pipeline process — it is not a standalone service.
222
+
223
+ ### Quick Start
224
+
225
+ ```bash
226
+ docker run -d --name falcon-ocr \
227
+ --gpus '"device=0,1"' \
228
+ -e EXPOSED_GPU_IDS=0,1 \
229
+ -e VLLM_GPU=0 \
230
+ -e PIPELINE_GPU=1 \
231
+ -e VLLM_GPU_MEM_UTIL=0.90 \
232
+ -p 8000:8000 \
233
+ -p 5002:5002 \
234
+ ghcr.io/tiiuae/falcon-ocr:latest
235
+ ```
236
+
237
+ ### API
238
+
239
+ <details name="api" open>
240
+ <summary><b>Health Checks</b></summary>
241
+
242
+ ```bash
243
+ curl http://localhost:8000/health # vLLM
244
+ curl http://localhost:5002/health # Pipeline
245
+ ```
246
+ </details>
247
+
248
+ <details name="api">
249
+ <summary><b>Upload</b> (multipart file upload — images and PDFs)</summary>
250
+
251
+ The easiest way to send files. Supports images and multi-page PDFs:
252
+
253
+ ```bash
254
+ # Single image
255
+ curl -X POST http://localhost:5002/falconocr/upload \
256
+ -F "files=@photo.jpg;type=image/jpeg"
257
+ # PDF document
258
+ curl -X POST http://localhost:5002/falconocr/upload \
259
+ -F "files=@document.pdf;type=application/pdf"
260
+ ```
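+
+ The same upload can be scripted from Python. A minimal sketch, assuming the `requests` package; file name and timeout are placeholders:
+
+ ```python
+ import requests
+
+ with open("document.pdf", "rb") as f:
+     resp = requests.post(
+         "http://localhost:5002/falconocr/upload",
+         files={"files": ("document.pdf", f, "application/pdf")},
+         timeout=300,
+     )
+ resp.raise_for_status()
+ print(resp.json())
+ ```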
261
+ </details>
262
+
263
+ <details name="api">
264
+ <summary><b>Parse</b> (full pipeline: layout + OCR)</summary>
265
+
266
+ Send base64-encoded images for layout detection, cropping, and OCR:
267
+
268
+ ```bash
269
+ curl -X POST http://localhost:5002/falconocr/parse \
270
+ -H "Content-Type: application/json" \
271
+ -d '{
272
+ "images": ["data:image/jpeg;base64,<...>"],
273
+ "skip_layout": false
274
+ }'
275
+ ```
276
+
277
+ Response:
278
+
279
+ ```json
280
+ {
281
+ "json_result": [[{
282
+ "index": 0,
283
+ "mapped_label": "text",
284
+ "content": "The Manuscript",
285
+ "bbox": [273, 273, 937, 380],
286
+ "score": 0.3145
287
+ }]],
288
+ "markdown_result": "The Manuscript",
289
+ "total_output_tokens": 93,
290
+ "processing_time_ms": 414
291
+ }
292
+ ```
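+
+ The same request from Python, as a minimal sketch assuming the `requests` package (file name and timeout are placeholders):
+
+ ```python
+ import base64
+ import requests
+
+ with open("page.png", "rb") as f:
+     b64 = base64.b64encode(f.read()).decode()
+
+ resp = requests.post(
+     "http://localhost:5002/falconocr/parse",
+     json={"images": [f"data:image/png;base64,{b64}"], "skip_layout": False},
+     timeout=300,
+ )
+ resp.raise_for_status()
+ print(resp.json()["markdown_result"])
+ ```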
293
+ </details>
294
+
295
+ <details name="api">
296
+ <summary><b>Parse</b> (direct VLM, no layout)</summary>
297
+
298
+ Skip layout detection and send the full image directly to the VLM:
299
+
300
+ ```bash
301
+ curl -X POST http://localhost:5002/falconocr/parse \
302
+ -H "Content-Type: application/json" \
303
+ -d '{
304
+ "images": ["data:image/jpeg;base64,<...>"],
305
+ "skip_layout": true
306
+ }'
307
+ ```
308
+ </details>
309
+
310
+ <details name="api">
311
+ <summary><b>Direct vLLM</b> (OpenAI-compatible)</summary>
312
+
313
+ ```bash
314
+ curl -X POST http://localhost:8000/v1/chat/completions \
315
+ -H "Content-Type: application/json" \
316
+ -d '{
317
+ "model": "falcon-ocr",
318
+ "messages": [{"role": "user", "content": [
319
+ {"type": "image_url", "image_url": {"url": "data:image/png;base64,<...>"}},
320
+ {"type": "text", "text": "Extract the text content from this image.\n<|OCR_PLAIN|>"}
321
+ ]}],
322
+ "max_tokens": 2048
323
+ }'
324
+ ```
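+
+ Because the endpoint is OpenAI-compatible, the official `openai` Python client can be pointed at it. A minimal sketch; the API key is a dummy value unless the server was started with one:
+
+ ```python
+ import base64
+ from openai import OpenAI
+
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ with open("page.png", "rb") as f:
+     b64 = base64.b64encode(f.read()).decode()
+
+ resp = client.chat.completions.create(
+     model="falcon-ocr",
+     messages=[{"role": "user", "content": [
+         {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
+         {"type": "text", "text": "Extract the text content from this image.\n<|OCR_PLAIN|>"},
+     ]}],
+     max_tokens=2048,
+ )
+ print(resp.choices[0].message.content)
+ ```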
325
+ </details>
326
+
327
+ ### Configuration
328
+
329
+ All settings are controlled via environment variables at `docker run` time.
330
+
331
+ <details name="config" open>
332
+ <summary><b>GPU Assignment</b></summary>
333
+
334
+ | Variable | Default | Description |
335
+ |----------|---------|-------------|
336
+ | `VLLM_GPU` | `0` | Host GPU ID for the vLLM process |
337
+ | `PIPELINE_GPU` | `0` | Host GPU ID for the pipeline (layout model) |
338
+ | `EXPOSED_GPU_IDS` | *(all visible)* | Comma-separated host GPU IDs passed via `--gpus` (for index remapping) |
339
+ </details>
340
+
341
+ <details name="config">
342
+ <summary><b>Port Assignment</b></summary>
343
+
344
+ | Variable | Default | Description |
345
+ |----------|---------|-------------|
346
+ | `VLLM_PORT` | `8000` | Port for the vLLM OpenAI-compatible API |
347
+ | `PIPELINE_PORT` | `5002` | Port for the pipeline API |
348
+ </details>
349
+
350
+ <details name="config">
351
+ <summary><b>vLLM Tuning</b></summary>
352
+
353
+ | Variable | Default | Description |
354
+ |----------|---------|-------------|
355
+ | `VLLM_GPU_MEM_UTIL` | `0.90` | Fraction of GPU memory vLLM can use |
356
+ | `MAX_NUM_SEQS` | `2048` | Max concurrent sequences in vLLM |
357
+ | `MAX_MODEL_LEN` | `8192` | Max model context length |
358
+ | `DTYPE` | `bfloat16` | Model dtype |
359
+ | `MAX_NUM_BATCHED_TOKENS` | *(auto)* | Max batched tokens per iteration |
360
+ | `CHUNKED_PREFILL` | `false` | Enable chunked prefill |
361
+ </details>
362
+
363
+ <details name="config">
364
+ <summary><b>Layout Model Tuning</b></summary>
365
+
366
+ | Variable | Default | Description |
367
+ |----------|---------|-------------|
368
+ | `LAYOUT_BATCH_SIZE` | `64` | Batch size for layout detection inference |
369
+ </details>
370
+
371
+ <details name="config">
372
+ <summary><b>Model Paths</b></summary>
373
+
374
+ | Variable | Default | Description |
375
+ |----------|---------|-------------|
376
+ | `FALCON_OCR_MODEL` | `/models/Falcon-OCR` | Path to Falcon-OCR VLM weights (inside container) |
377
+ | `SERVED_MODEL_NAME` | `falcon-ocr` | Model name exposed by vLLM API |
378
+ </details>
379
+
380
+ ### Deployment Modes
381
+
382
+ <details name="deploy" open>
383
+ <summary><b>Two GPUs</b> (best throughput)</summary>
384
+
385
+ vLLM on one GPU, layout model on another — zero GPU contention:
386
+
387
+ ```bash
388
+ docker run -d --name falcon-ocr \
389
+ --gpus '"device=3,4"' \
390
+ -e EXPOSED_GPU_IDS=3,4 \
391
+ -e VLLM_GPU=3 \
392
+ -e PIPELINE_GPU=4 \
393
+ -e VLLM_GPU_MEM_UTIL=0.90 \
394
+ -p 8000:8000 \
395
+ -p 5002:5002 \
396
+ ghcr.io/tiiuae/falcon-ocr:latest
397
+ ```
398
+ </details>
399
+
400
+ <details name="deploy">
401
+ <summary><b>Single GPU</b> (memory sharing)</summary>
402
+
403
+ Both services share one GPU — tune `VLLM_GPU_MEM_UTIL` to leave room for the layout model:
404
+
405
+ ```bash
406
+ docker run -d --name falcon-ocr \
407
+ --gpus '"device=0"' \
408
+ -e EXPOSED_GPU_IDS=0 \
409
+ -e VLLM_GPU=0 \
410
+ -e PIPELINE_GPU=0 \
411
+ -e VLLM_GPU_MEM_UTIL=0.55 \
412
+ -e LAYOUT_BATCH_SIZE=32 \
413
+ -e MAX_NUM_SEQS=512 \
414
+ -p 8000:8000 \
415
+ -p 5002:5002 \
416
+ ghcr.io/tiiuae/falcon-ocr:latest
417
+ ```
418
+ </details>
419
+
420
+ <details name="deploy">
421
+ <summary><b>Custom Ports</b></summary>
422
+
423
+ ```bash
424
+ docker run -d --name falcon-ocr \
425
+ --gpus '"device=0,1"' \
426
+ -e EXPOSED_GPU_IDS=0,1 \
427
+ -e VLLM_GPU=0 \
428
+ -e PIPELINE_GPU=1 \
429
+ -e VLLM_PORT=18000 \
430
+ -e PIPELINE_PORT=15002 \
431
+ -p 18000:18000 \
432
+ -p 15002:15002 \
433
+ ghcr.io/tiiuae/falcon-ocr:latest
434
+ ```
435
+
436
+ Docker `--gpus "device=3,4"` makes the container see GPUs as local indices `0,1`.
437
+ `EXPOSED_GPU_IDS=3,4` allows you to reference host GPU IDs (`VLLM_GPU=3`, `PIPELINE_GPU=4`);
438
+ the entrypoint remaps them to the correct container-local indices.
439
+ </details>
440
+
441
+
442
+ ## Citation
443
+
444
+ If you use Falcon OCR, please cite:
445
+
446
+ ```bibtex
447
+ @misc{falconocr2026,
448
+   title        = {Falcon OCR},
+   author       = {TII Falcon Vision Team},
+   year         = {2026},
+   howpublished = {arXiv preprint, link forthcoming},
+   note         = {Code: https://github.com/tiiuae/Falcon-Perception},
453
+ }
454
+ ```
455
+
456
+
attention.py ADDED
@@ -0,0 +1,129 @@
1
+ import torch
2
+ from torch import Tensor as T
3
+ from torch.nn.attention.flex_attention import (
4
+ BlockMask,
5
+ _mask_mod_signature,
6
+ and_masks,
7
+ create_block_mask,
8
+ flex_attention,
9
+ or_masks,
10
+ )
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # Two compiled variants of flex_attention
14
+ # ---------------------------------------------------------------------------
15
+ # _decode: fullgraph=True, static shapes.
16
+ # Used for decode steps (S_q == 1) where shapes are fixed and
17
+ # the call will be captured inside a CUDA graph. fullgraph=True
18
+ # avoids graph breaks that would corrupt the capture.
19
+ #
20
+ # _prefill: dynamic=True, symbolic shapes.
21
+ # Used for prefill steps (S_q > 1) where the sequence length
22
+ # varies per image. dynamic=True lets one compiled graph handle
23
+ # all lengths without recompilation. Prefill is never inside a
24
+ # CUDA graph, so symbolic shape guards are fine.
25
+ compiled_flex_attn_decode = torch.compile(flex_attention, fullgraph=True)
26
+ compiled_flex_attn_prefill = torch.compile(flex_attention, dynamic=True)
27
+
28
+
29
+ def offset_mask_mod(mask_mod: _mask_mod_signature, offset: int):
30
+ """Get a mask mod function with an offset applied to the query positions."""
31
+
32
+ def _mask_mod(b, h, q, kv):
33
+ return mask_mod(b, h, q + offset, kv)
34
+
35
+ return _mask_mod
36
+
37
+
38
+ def get_causal_mask_mod() -> _mask_mod_signature:
39
+ """Causal mask that prevents attention to future tokens."""
40
+
41
+ def _causal_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
42
+ return q_idx >= kv_idx
43
+
44
+ return _causal_mask
45
+
46
+
47
+ def get_document_mask_mod(batch: T, eos_id: int) -> _mask_mod_signature:
48
+ """Document mask: prevents attention across document boundaries (token IDs [B, S])."""
49
+ eos_mask = batch == eos_id
50
+ eos_mask[:, -1] = True
51
+ cumulative_mask = torch.cumsum(torch.where(eos_mask, 1, 0), dim=1)
52
+ sequence_indices = torch.zeros_like(cumulative_mask, dtype=torch.int32)
53
+ sequence_indices[:, 1:] = cumulative_mask[:, :-1]
54
+
55
+ def document_mask(b: T, h: T, q_idx: T, kv_idx: T) -> T:
56
+ return sequence_indices[b, q_idx] == sequence_indices[b, kv_idx]
57
+
58
+ return document_mask
59
+
60
+
61
+ def get_non_left_pad_mask_mod(batch: T, pad_id: int) -> _mask_mod_signature:
62
+ """Prevent model from attending to the left-padded token required for correct batch inference."""
63
+
64
+ non_pad_mask_id = torch.cumsum(batch != pad_id, dim=1)
65
+
66
+ # Left-most pad tokens have cumulative id == 0.
67
+ def mask_mod(b, h, q_idx, kv_idx):
68
+ return non_pad_mask_id[b, kv_idx] > 0
69
+
70
+ return mask_mod
71
+
72
+
73
+ def get_image_prefix_mask_mod(
74
+ batch: T, soi_id: int, eoi_id: int
75
+ ) -> _mask_mod_signature:
76
+ """Image-prefix mask: tokens between SOI and EOI attend only within same image."""
77
+ soi_mask = batch == soi_id
78
+ eoi_mask = batch == eoi_id
79
+ acc_soi_mask = torch.cumsum(soi_mask, dim=1)
80
+ acc_eoi_mask = torch.cumsum(eoi_mask, dim=1)
81
+ img_mask = (acc_soi_mask - acc_eoi_mask) > 0
82
+ img_indices = acc_soi_mask * img_mask
83
+
84
+ def image_prefix_mask_mod(b, h, q_idx, kv_idx):
85
+ is_img_tokens = img_mask[b, q_idx] & img_mask[b, kv_idx]
86
+ is_same_image = img_indices[b, q_idx] == img_indices[b, kv_idx]
87
+ return is_img_tokens & is_same_image
88
+
89
+ return image_prefix_mask_mod
90
+
91
+
92
+ _compiled_create_block_mask = torch.compile(
93
+ create_block_mask, dynamic=True
94
+ ) # reduce-overhead mode breaks manual CUDA graph capture (private streams)
95
+
96
+
97
+ @torch.inference_mode()
98
+ def create_attention_mask(*args, **kwargs) -> BlockMask:
99
+ """Compiled for large masks; inference_mode avoids grad_mode recompiles."""
100
+ return _compiled_create_block_mask(*args, **kwargs)
101
+
102
+
103
+ def create_batch_attention_mask(
104
+ input_batch: T,
105
+ *,
106
+ pad_token_id: int,
107
+ eos_token_id: int,
108
+ soi_token_id: int,
109
+ eoi_token_id: int,
110
+ max_len: int | None = None,
111
+ ) -> BlockMask:
112
+ """Build the combined FlexAttention mask for the batch engine.
113
+
114
+ Composes causal + document + non-left-pad + image-prefix masks.
115
+ """
116
+ B, S = input_batch.size()
117
+ block_causal_mask_mod = and_masks(
118
+ get_causal_mask_mod(),
119
+ get_document_mask_mod(input_batch, eos_token_id),
120
+ get_non_left_pad_mask_mod(input_batch, pad_token_id),
121
+ )
122
+ image_prefix_mask_mod = get_image_prefix_mask_mod(
123
+ batch=input_batch,
124
+ soi_id=soi_token_id,
125
+ eoi_id=eoi_token_id,
126
+ )
127
+ mask_mod = or_masks(image_prefix_mask_mod, block_causal_mask_mod)
128
+ max_len = max_len or S
129
+ return create_attention_mask(mask_mod, B, None, max_len, max_len)
config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "architectures": [
3
+ "FalconOCRForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_falcon_ocr.FalconOCRConfig",
7
+ "AutoModelForCausalLM": "modeling_falcon_ocr.FalconOCRForCausalLM"
8
+ },
9
+ "model_type": "falcon_ocr",
10
+ "torch_dtype": "float32",
11
+ "dim": 768,
12
+ "n_layers": 22,
13
+ "n_heads": 16,
14
+ "head_dim": 64,
15
+ "n_kv_heads": 8,
16
+ "vocab_size": 65536,
17
+ "ffn_dim": 2304,
18
+ "norm_eps": 1e-05,
19
+ "max_seq_len": 8192,
20
+ "rope_theta": 10000,
21
+ "channel_size": 3,
22
+ "spatial_patch_size": 16,
23
+ "temporal_patch_size": 1,
24
+ "eos_id": 11,
25
+ "img_id": 227,
26
+ "image_cls_token_id": 244,
27
+ "image_reg_1_token_id": 245,
28
+ "image_reg_2_token_id": 246,
29
+ "image_reg_3_token_id": 247,
30
+ "image_reg_4_token_id": 248,
31
+ "img_end_id": 230
32
+ }
configuration_falcon_ocr.py ADDED
@@ -0,0 +1,65 @@
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class FalconOCRConfig(PretrainedConfig):
5
+ model_type = "falcon_ocr"
6
+
7
+ def __init__(
8
+ self,
9
+ dim: int = 768,
10
+ n_layers: int = 22,
11
+ n_heads: int = 16,
12
+ head_dim: int = 64,
13
+ n_kv_heads: int = 8,
14
+ vocab_size: int = 65536,
15
+ ffn_dim: int = 2304,
16
+ norm_eps: float = 1e-5,
17
+ max_seq_len: int = 8192,
18
+ rope_theta: int = 10000,
19
+ channel_size: int = 3,
20
+ spatial_patch_size: int = 16,
21
+ temporal_patch_size: int = 1,
22
+ img_id: int = 227,
23
+ eos_id: int = 11,
24
+ image_cls_token_id: int = 244,
25
+ image_mask_token_id: int = 243,
26
+ image_reg_1_token_id: int = 245,
27
+ image_reg_2_token_id: int = 246,
28
+ image_reg_3_token_id: int = 247,
29
+ image_reg_4_token_id: int = 248,
30
+ img_start_id: int = 229,
31
+ img_end_id: int = 230,
32
+ img_row_sep_id: int = 228,
33
+ vid_start_id: int = 231,
34
+ vid_end_id: int = 232,
35
+ frame_sep_id: int = 233,
36
+ **kwargs,
37
+ ):
38
+ self.dim = dim
39
+ self.n_layers = n_layers
40
+ self.n_heads = n_heads
41
+ self.head_dim = head_dim
42
+ self.n_kv_heads = n_kv_heads
43
+ self.vocab_size = vocab_size
44
+ self.ffn_dim = ffn_dim
45
+ self.norm_eps = norm_eps
46
+ self.max_seq_len = max_seq_len
47
+ self.rope_theta = rope_theta
48
+ self.channel_size = channel_size
49
+ self.spatial_patch_size = spatial_patch_size
50
+ self.temporal_patch_size = temporal_patch_size
51
+ self.img_id = img_id
52
+ self.eos_id = eos_id
53
+ self.image_cls_token_id = image_cls_token_id
54
+ self.image_mask_token_id = image_mask_token_id
55
+ self.image_reg_1_token_id = image_reg_1_token_id
56
+ self.image_reg_2_token_id = image_reg_2_token_id
57
+ self.image_reg_3_token_id = image_reg_3_token_id
58
+ self.image_reg_4_token_id = image_reg_4_token_id
59
+ self.img_start_id = img_start_id
60
+ self.img_end_id = img_end_id
61
+ self.img_row_sep_id = img_row_sep_id
62
+ self.vid_start_id = vid_start_id
63
+ self.vid_end_id = vid_end_id
64
+ self.frame_sep_id = frame_sep_id
65
+ super().__init__(**kwargs)
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e7f73a508c050f3a4f0b5ce196dba36db896626e44c4f8976d3a3e3c18ceb2a
3
+ size 1079789440
model_args.json ADDED
@@ -0,0 +1,37 @@
1
+ {
2
+ "channel_size": 3,
3
+ "coord_dec_dim": 8192,
4
+ "coord_enc_dim": 512,
5
+ "coord_out_dim": 2048,
6
+ "coord_token_id": 240,
7
+ "dim": 768,
8
+ "eos_id": 11,
9
+ "ffn_dim": 2304,
10
+ "head_dim": 64,
11
+ "image_cls_token_id": 244,
12
+ "image_reg_1_token_id": 245,
13
+ "image_reg_2_token_id": 246,
14
+ "image_reg_3_token_id": 247,
15
+ "image_reg_4_token_id": 248,
16
+ "img_end_id": 230,
17
+ "img_id": 227,
18
+ "img_row_sep_id": 228,
19
+ "img_start_id": 229,
20
+ "max_seq_len": 8192,
21
+ "n_heads": 16,
22
+ "n_kv_heads": 8,
23
+ "n_layers": 22,
24
+ "norm_eps": 1e-05,
25
+ "num_segm_layers": 3,
26
+ "perception_heads": false,
27
+ "rope_theta": 10000,
28
+ "seg_token_id": 262,
29
+ "segm_out_dim": 256,
30
+ "size_dec_dim": 8192,
31
+ "size_enc_dim": 512,
32
+ "size_out_dim": 2048,
33
+ "size_token_id": 241,
34
+ "spatial_patch_size": 16,
35
+ "temporal_patch_size": 1,
36
+ "vocab_size": 65536
37
+ }
modeling_falcon_ocr.py ADDED
@@ -0,0 +1,845 @@
1
+ from pathlib import Path
2
+
3
+ import einops as E
4
+ import torch
5
+ import torch.nn.functional as F
6
+ import triton
7
+ import triton.language as tl
8
+ from PIL import Image
9
+ from torch import Tensor as T
10
+ from torch import nn
11
+ from torch.nn.attention.flex_attention import (
12
+ AuxRequest,
13
+ BlockMask,
14
+ )
15
+ from transformers import AutoTokenizer, PreTrainedModel
16
+
17
+ from .attention import (
18
+ compiled_flex_attn_decode,
19
+ compiled_flex_attn_prefill,
20
+ create_batch_attention_mask,
21
+ offset_mask_mod,
22
+ )
23
+ from .configuration_falcon_ocr import FalconOCRConfig
24
+ from .processing_falcon_ocr import load_image, process_batch
25
+ from .rope import (
26
+ apply_3d_rotary_emb,
27
+ apply_golden_freqs_cis_to_visual_pos,
28
+ precompute_freqs_cis,
29
+ )
30
+
31
+
32
+ CATEGORY_PROMPTS = {
33
+ "plain": "Extract the text content from this image.",
34
+ "formula": "Extract the formula content from this image.",
35
+ "table": "Extract the table content from this image.",
36
+ "text": "Extract the text content from this image.",
37
+ "caption": "Extract the caption content from this image.",
38
+ "footnote": "Extract the footnote content from this image.",
39
+ "list-item": "Extract the list-item content from this image.",
40
+ "page-footer": "Extract the page-footer content from this image.",
41
+ "page-header": "Extract the page-header content from this image.",
42
+ "section-header": "Extract the section-header content from this image.",
43
+ "title": "Extract the title content from this image.",
44
+ }
45
+
46
+ LAYOUT_TO_OCR_CATEGORY: dict[str, str | None] = {
47
+ "text": "text",
48
+ "table": "table",
49
+ "formula": "formula",
50
+ "caption": "caption",
51
+ "footnote": "footnote",
52
+ "list-item": "list-item",
53
+ "title": "title",
54
+ "header": "text",
55
+ "footer": "page-footer",
56
+ "number": "text",
57
+ "figure_title": "caption",
58
+ "paragraph_title": "section-header",
59
+ "doc_title": "title",
60
+ "reference_content": "text",
61
+ "reference": "text",
62
+ "abstract": "text",
63
+ "aside_text": "text",
64
+ "content": "text",
65
+ "formula_number": "text",
66
+ "vision_footnote": "footnote",
67
+ "algorithm": "text",
68
+ "page-footer": "page-footer",
69
+ "page-header": "page-header",
70
+ "section-header": "section-header",
71
+ # Skip — no text to extract
72
+ "image": None,
73
+ "picture": None,
74
+ "figure": None,
75
+ "chart": None,
76
+ "seal": None,
77
+ }
78
+
79
+ _LAYOUT_TARGET_H, _LAYOUT_TARGET_W = 800, 800
80
+ _MIN_CROP_DIM = 16
81
+
82
+ def _box_area(bbox):
83
+ return max(0, bbox[2] - bbox[0]) * max(0, bbox[3] - bbox[1])
84
+
85
+
86
+ def _intersection_area(a, b):
87
+ return max(0, min(a[2], b[2]) - max(a[0], b[0])) * max(0, min(a[3], b[3]) - max(a[1], b[1]))
88
+
89
+
90
+ def _containment_ratio(small, large):
91
+ area = _box_area(small)
92
+ if area <= 0:
93
+ return 0.0
94
+ return _intersection_area(small, large) / area
95
+
96
+
97
+ def _filter_nested_detections(detections: list[dict], containment_threshold: float = 0.8) -> list[dict]:
98
+ """Remove any box that is mostly contained within a strictly larger box."""
99
+ areas = [_box_area(d["bbox"]) for d in detections]
100
+ keep = []
101
+ for i, det in enumerate(detections):
102
+ is_nested = False
103
+ for j, other in enumerate(detections):
104
+ if i == j:
105
+ continue
106
+ if areas[j] <= areas[i]:
107
+ continue
108
+ if _containment_ratio(det["bbox"], other["bbox"]) > containment_threshold:
109
+ is_nested = True
110
+ break
111
+ if not is_nested:
112
+ keep.append(det)
113
+ return keep
114
+
115
+
116
+ # Attention
117
+
118
+ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
119
+ B, S, H, D = x.shape
120
+ if n_rep == 1:
121
+ return x
122
+ return torch.unsqueeze(x, dim=3).expand(B, S, H, n_rep, D).reshape(B, S, H * n_rep, D)
123
+
124
+
125
+ class Attention(nn.Module):
126
+ def __init__(self, config: FalconOCRConfig, layer_id: int):
127
+ super().__init__()
128
+ self.layer_id = layer_id
129
+ self.n_kv_heads = config.n_kv_heads or config.n_heads
130
+ self.n_rep = config.n_heads // self.n_kv_heads
131
+ self.head_dim = config.head_dim or config.dim // config.n_heads
132
+ self.q_dim = config.n_heads * self.head_dim
133
+ self.kv_dim = self.n_kv_heads * self.head_dim
134
+
135
+ self.wqkv = nn.Linear(config.dim, self.q_dim + 2 * self.kv_dim, bias=False)
136
+ self.wo = nn.Linear(config.n_heads * self.head_dim, config.dim, bias=False)
137
+ self.sinks = nn.Parameter(torch.empty((config.n_heads,)))
138
+
139
+ def _pre_attention_qkv(self, x) -> tuple[T, T, T]:
140
+ qkv = self.wqkv(F.rms_norm(x, (x.size(-1),)))
141
+ xq, xk, xv = qkv.split([self.q_dim, self.kv_dim, self.kv_dim], dim=-1)
142
+ xq = E.rearrange(xq, "b s (h d) -> b s h d", d=self.head_dim)
143
+ xk = E.rearrange(xk, "b s (h d) -> b s h d", d=self.head_dim)
144
+ xv = E.rearrange(xv, "b s (h d) -> b s h d", d=self.head_dim)
145
+ xq = F.rms_norm(xq, (xq.size(-1),))
146
+ xk = F.rms_norm(xk, (xk.size(-1),))
147
+ xk = repeat_kv(xk, n_rep=self.n_rep)
148
+ xv = repeat_kv(xv, n_rep=self.n_rep)
149
+ return xq, xk, xv
150
+
151
+ def _post_attention(self, output: T, lse: T) -> T:
152
+ # Sink-based scaling: sigmoid(lse - sinks) * output
153
+ # equivalent to prepending a sink token to the input
154
+ sinks_BHS = self.sinks.view(1, -1, 1)
155
+ sink_scale = torch.sigmoid(lse - sinks_BHS)
156
+ output = (output * sink_scale.unsqueeze(-1)).to(output.dtype)
157
+ output = output.permute(0, 2, 1, 3).contiguous().flatten(2)
158
+ return self.wo(output)
159
+
160
+ def compile_attention(self, *, dynamic: bool = True, mode: str = "default"):
161
+ self._pre_attention_qkv = torch.compile(self._pre_attention_qkv, dynamic=dynamic, mode=mode)
162
+ self._post_attention = torch.compile(self._post_attention, dynamic=dynamic, mode=mode)
163
+
164
+ def forward(
165
+ self, x: T, attention_masks: BlockMask, freqs_cis: T,
166
+ freqs_cis_2d: T | None = None, pos_hw: T | None = None,
167
+ kv_cache=None, input_pos=None, batch_idx=None,
168
+ flex_attn_kernel_options=None,
169
+ ):
170
+ xq, xk, xv = self._pre_attention_qkv(x)
171
+ xq, xk = apply_3d_rotary_emb(xq, xk, freqs_cis, freqs_cis_2d, pos_hw)
172
+ xq = E.rearrange(xq, "b s h d -> b h s d")
173
+ xk = E.rearrange(xk, "b s h d -> b h s d")
174
+ xv = E.rearrange(xv, "b s h d -> b h s d")
175
+ xk, xv = kv_cache.insert_kv(self.layer_id, xk, xv, input_pos=input_pos, batch_idx=batch_idx)
176
+ flex_fn = compiled_flex_attn_decode if xq.shape[2] == 1 else compiled_flex_attn_prefill
177
+ output, aux_output = flex_fn(xq, xk, xv, block_mask=attention_masks, return_aux=AuxRequest(lse=True))
178
+ return self._post_attention(output, aux_output.lse)
179
+
180
+
181
+ # FeedForward
182
+
183
+ @triton.jit
184
+ def _squared_relu_gate_kernel(
185
+ packed_ptr, out_ptr, n_rows, n_cols,
186
+ in_row_stride, in_col_stride, out_row_stride, out_col_stride,
187
+ BLOCK_SIZE: tl.constexpr,
188
+ ):
189
+ pid = tl.program_id(0)
190
+ n_elements = n_rows * n_cols
191
+ offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
192
+ mask = offsets < n_elements
193
+ rows = offsets // n_cols
194
+ cols = offsets % n_cols
195
+ gate_idx = rows * in_row_stride + (2 * cols) * in_col_stride
196
+ up_idx = rows * in_row_stride + (2 * cols + 1) * in_col_stride
197
+ out_idx = rows * out_row_stride + cols * out_col_stride
198
+ gate = tl.load(packed_ptr + gate_idx, mask=mask)
199
+ up = tl.load(packed_ptr + up_idx, mask=mask)
200
+ gate = tl.where(gate > 0, gate, 0.0)
201
+ out = gate * gate * up
202
+ tl.store(out_ptr + out_idx, out, mask=mask)
203
+
204
+
205
+ def squared_relu_gate(packed: T, hidden_dim: int) -> T:
206
+ """Processes interleaved [gate, up, gate, up, ...] from w13; output = ReLU(gate)^2 * up."""
207
+ packed_2d = packed.flatten(0, -2)
208
+ n_rows = packed_2d.shape[0]
209
+ n_cols = hidden_dim
210
+ out_2d = torch.empty((n_rows, n_cols), device=packed.device, dtype=packed.dtype)
211
+ n = n_rows * n_cols
212
+ grid = lambda meta: (triton.cdiv(n, meta["BLOCK_SIZE"]),)
213
+ _squared_relu_gate_kernel[grid](
214
+ packed_2d, out_2d, n_rows, n_cols,
215
+ packed_2d.stride(0), packed_2d.stride(1),
216
+ out_2d.stride(0), out_2d.stride(1),
217
+ BLOCK_SIZE=1024,
218
+ )
219
+ return out_2d.view(*packed.shape[:-1], hidden_dim)
220
+
221
+
222
+ class FeedForward(nn.Module):
223
+ def __init__(self, dim: int, hidden_dim: int):
224
+ super().__init__()
225
+ self.w13 = nn.Linear(dim, 2 * hidden_dim, bias=False)
226
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
227
+ self.hidden_dim = hidden_dim
228
+
229
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
230
+ x = F.rms_norm(x, (x.size(-1),))
231
+ w13_out = self.w13(x)
232
+ return self.w2(squared_relu_gate(w13_out, self.hidden_dim))
233
+
234
+
235
+ # TransformerBlock
236
+
237
+ class TransformerBlock(nn.Module):
238
+ def __init__(self, layer_id: int, config: FalconOCRConfig):
239
+ super().__init__()
240
+ self.attention = Attention(config, layer_id)
241
+ self.feed_forward = FeedForward(config.dim, config.ffn_dim)
242
+
243
+ def compile(self, *, dynamic: bool = True, mode: str = "default"):
244
+ self.feed_forward = torch.compile(self.feed_forward, dynamic=dynamic, mode=mode)
245
+ self.attention.compile_attention(dynamic=dynamic, mode=mode)
246
+ return self
247
+
248
+ def forward(
249
+ self, x: T, freqs_cis: T, freqs_cis_2d: T | None = None,
250
+ pos_hw: T | None = None, attention_masks=None, kv_cache=None,
251
+ input_pos=None, batch_idx=None, flex_attn_kernel_options=None,
252
+ ):
253
+ B, S, D = x.shape
254
+ x = x + self.attention(
255
+ x, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_2d, pos_hw=pos_hw,
256
+ attention_masks=attention_masks, kv_cache=kv_cache,
257
+ input_pos=input_pos, batch_idx=batch_idx,
258
+ flex_attn_kernel_options=flex_attn_kernel_options,
259
+ )
260
+ out = x + self.feed_forward(x)
261
+ return out.reshape(B, S, D)
262
+
263
+
264
+ # KV Cache
265
+
266
+ class KVCache:
267
+ def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, num_layers):
268
+ self.kv_shape = (num_layers, 2, max_batch_size, n_heads, max_seq_length, head_dim)
269
+ self.kv_cache = None
270
+ self.pos = 0
271
+ self.pos_t: T | None = None
272
+
273
+ def reset(self):
274
+ self.pos = 0
275
+ self.pos_t = None
276
+
277
+ def get_pos(self):
278
+ return self.pos
279
+
280
+ def set_pos_t(self, pos_t):
281
+ self.pos_t = pos_t
282
+
283
+ def increment_and_get_pos_t(self):
284
+ assert self.pos_t is not None
285
+ self.pos_t += 1
286
+ return self.pos_t
287
+
288
+ def insert_kv(self, layer_id: int, k: T, v: T, **kwargs):
289
+ del kwargs
290
+ assert self.pos_t is not None
291
+ if self.kv_cache is None:
292
+ self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
293
+ B, H, T_add, D = k.size()
294
+ t0, t1 = self.pos, self.pos + T_add
295
+ self.kv_cache[layer_id, 0, :, :, t0:t1] = k
296
+ self.kv_cache[layer_id, 1, :, :, t0:t1] = v
297
+ key_view = self.kv_cache[layer_id, 0, :, :, :t1]
298
+ value_view = self.kv_cache[layer_id, 1, :, :, :t1]
299
+ if layer_id == self.kv_cache.size(0) - 1:
300
+ self.pos = t1
301
+ return key_view, value_view
302
+
303
+
304
+ # Sampling
305
+
306
+ @torch.inference_mode()
307
+ def sample_next_token(logits, rng, temperature=0.0, top_k=None):
308
+ assert temperature >= 0.0
309
+ if temperature == 0.0:
310
+ return torch.argmax(logits, dim=-1, keepdim=True)
311
+ if top_k is not None:
312
+ k = min(top_k, logits.size(-1))
313
+ vals, idx = torch.topk(logits, k, dim=-1)
314
+ vals = vals / temperature
315
+ probs = F.softmax(vals, dim=-1)
316
+ choice = torch.multinomial(probs, num_samples=1, generator=rng)
317
+ return idx.gather(1, choice)
318
+ logits = logits / temperature
319
+ probs = F.softmax(logits, dim=-1)
320
+ return torch.multinomial(probs, num_samples=1, generator=rng)
321
+
322
+
323
+ # Main Model
324
+
325
+ class FalconOCRForCausalLM(PreTrainedModel):
326
+ config_class = FalconOCRConfig
327
+ _no_split_modules = ["TransformerBlock"]
328
+
329
+ def __init__(self, config: FalconOCRConfig):
330
+ super().__init__(config)
331
+ img_in_dim = config.temporal_patch_size * config.spatial_patch_size ** 2 * config.channel_size
332
+ self.img_projector = nn.Linear(img_in_dim, config.dim, bias=False)
333
+ self.tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
334
+
335
+ self.layers = nn.ModuleDict()
336
+ for layer_id in range(config.n_layers):
337
+ self.layers[str(layer_id)] = TransformerBlock(layer_id, config)
338
+
339
+ self.norm = nn.RMSNorm(config.dim, eps=config.norm_eps)
340
+ self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
341
+
342
+ rope_dim = config.head_dim // 2
343
+ freqs_cis = precompute_freqs_cis(rope_dim, config.max_seq_len, config.rope_theta)
344
+ freqs_cis_golden = torch.empty((config.n_heads, rope_dim // 2, 2), dtype=torch.float)
345
+ self.register_buffer("freqs_cis", freqs_cis, persistent=False)
346
+ self.register_buffer("freqs_cis_golden", freqs_cis_golden, persistent=True)
347
+
348
+ self._weights_fused = False
349
+ self._is_compiled = False
350
+
351
+ self.post_init()
352
+
353
+ # Weight management
354
+
355
+ def _ensure_device_buffers(self):
356
+ """Recompute non-persistent buffers that HF meta-device loading may discard."""
357
+ if self._weights_fused:
358
+ return
359
+ device = self.tok_embeddings.weight.device
360
+ c = self.config
361
+ rope_dim = c.head_dim // 2
362
+ freqs_cis = precompute_freqs_cis(rope_dim, c.max_seq_len, c.rope_theta).to(device)
363
+ self.register_buffer("freqs_cis", freqs_cis, persistent=False)
364
+ if self.freqs_cis_golden.device != device:
365
+ self.freqs_cis_golden = self.freqs_cis_golden.to(device)
366
+ self._weights_fused = True
367
+
368
+ def compile_model(self):
369
+ if self._is_compiled:
370
+ return
371
+ torch._inductor.config.triton.cudagraphs = False
372
+ for layer in self.layers.values():
373
+ layer.compile(dynamic=True, mode="default")
374
+ self._is_compiled = True
375
+
376
+ # Tokenizer
377
+
378
+ def _get_tokenizer(self):
379
+ if not hasattr(self, "_tokenizer"):
380
+ import os
381
+ path = self.config._name_or_path
382
+ is_local = os.path.exists(path)
383
+ self._tokenizer = AutoTokenizer.from_pretrained(path, local_files_only=is_local, trust_remote_code=True)
384
+ for token_name, token in self._tokenizer.special_tokens_map.items():
385
+ if isinstance(token, str):
386
+ setattr(self._tokenizer, token_name, token)
387
+ setattr(
388
+ self._tokenizer, token_name + "_id",
389
+ self._tokenizer.convert_tokens_to_ids(token),
390
+ )
391
+ return self._tokenizer
392
+
393
+ # Attention mask
394
+
395
+ def get_attention_mask(self, input_batch: T, max_len: int | None = None):
396
+ return create_batch_attention_mask(
397
+ input_batch,
398
+ pad_token_id=self._pad_token_id,
399
+ eos_token_id=self.config.eos_id,
400
+ soi_token_id=self.config.image_cls_token_id,
401
+ eoi_token_id=self.config.img_end_id,
402
+ max_len=max_len,
403
+ )
404
+
405
+ # Embedding helpers
406
+
407
+ def _scatter_img_tokens_with_projector(self, h_BSD, pixel_patches_NLC, pixel_masks_NTHW, tokens_BS):
408
+ B, S, D = h_BSD.shape
409
+ pixel_patch_mask = E.reduce(
410
+ pixel_masks_NTHW,
411
+ "n (t pt) (h ph) (w pw) -> (n t h w)",
412
+ reduction="any",
413
+ pt=self.config.temporal_patch_size,
414
+ ph=self.config.spatial_patch_size,
415
+ pw=self.config.spatial_patch_size,
416
+ )
417
+ pixel_patches_flat = E.rearrange(pixel_patches_NLC, "n p c -> (n p) c")
418
+ valid_patches = pixel_patches_flat[pixel_patch_mask]
419
+ valid_feats = self.img_projector(valid_patches)
420
+ img_mask_h_BSD = E.repeat(tokens_BS == self.config.img_id, "b s -> b s d", d=D)
421
+ assert valid_feats.numel() == img_mask_h_BSD.sum()
422
+ return torch.masked_scatter(h_BSD, img_mask_h_BSD, valid_feats)
423
+
424
+ # Core forward
425
+
426
+ def forward(
427
+ self,
428
+ tokens: T,
429
+ attention_mask: BlockMask,
430
+ kv_cache,
431
+ rope_pos_t: T | None = None,
432
+ rope_pos_hw: T | None = None,
433
+ pixel_values: T | None = None,
434
+ pixel_mask: T | None = None,
435
+ ):
436
+ B, S = tokens.size()
437
+ c = self.config
438
+ block_mask = attention_mask
439
+
440
+ T_pos = kv_cache.get_pos()
441
+ is_prefill = S != 1
442
+
443
+ if is_prefill:
444
+ assert rope_pos_t is not None and rope_pos_hw is not None
445
+ pos_t = rope_pos_t[:, T_pos:T_pos + S].long()
446
+ kv_cache.pos_t = pos_t[:, -1:]
447
+ freqs_cis = self.freqs_cis[pos_t]
448
+ rope_pos_hw = rope_pos_hw[:, T_pos:T_pos + S]
449
+ freqs_cis_golden = apply_golden_freqs_cis_to_visual_pos(self.freqs_cis_golden, rope_pos_hw)
450
+ block_mask.seq_lengths = (S, S)
451
+ else:
452
+ pos_t = kv_cache.increment_and_get_pos_t()
453
+ freqs_cis = self.freqs_cis[pos_t]
454
+ freqs_cis_golden = None
455
+ block_idx = T_pos // block_mask.BLOCK_SIZE[0]
456
+ block_mask = block_mask[:, :, block_idx]
457
+ block_mask.seq_lengths = (S, T_pos + S)
458
+ block_mask.mask_mod = offset_mask_mod(attention_mask.mask_mod, offset=T_pos)
459
+
460
+ h_BSD = self.tok_embeddings(tokens)
461
+
462
+ if pixel_values is not None:
463
+ assert pixel_mask is not None
464
+ pixel_values = pixel_values.to(self.dtype)
465
+ pixel_mask = pixel_mask.to(self.dtype)
466
+ pixel_patches_NLC = E.rearrange(
467
+ pixel_values,
468
+ "n (t pt) (h ph) (w pw) c -> n (t h w) (pt ph pw c)",
469
+ pt=c.temporal_patch_size, ph=c.spatial_patch_size, pw=c.spatial_patch_size,
470
+ )
471
+ h_BSD = self._scatter_img_tokens_with_projector(h_BSD, pixel_patches_NLC, pixel_mask, tokens)
472
+
473
+ for layer in self.layers.values():
474
+ h_BSD = layer(
475
+ h_BSD, freqs_cis=freqs_cis, freqs_cis_2d=freqs_cis_golden,
476
+ pos_hw=rope_pos_hw, attention_masks=block_mask, kv_cache=kv_cache,
477
+ )
478
+
479
+ h_BSD = self.norm(h_BSD)
480
+ logits_BSV = self.output(h_BSD)
481
+ return logits_BSV
482
+
483
+ # Layout detection
484
+
485
+ def _load_layout_model(self, layout_model: str = "PaddlePaddle/PP-DocLayoutV3_safetensors"):
486
+ if hasattr(self, "_layout_model"):
487
+ return
488
+ import torchvision.transforms.functional as tvF
489
+ from transformers import AutoModelForObjectDetection, PPDocLayoutV3ImageProcessorFast
490
+
491
+ self._layout_processor = PPDocLayoutV3ImageProcessorFast.from_pretrained(layout_model)
492
+ self._layout_det_model = AutoModelForObjectDetection.from_pretrained(
493
+ layout_model, torch_dtype=torch.float16,
494
+ ).to(self.device).eval()
495
+ self._layout_id2label = self._layout_det_model.config.id2label
496
+ self._tvF = tvF
497
+
498
+ @torch.inference_mode()
499
+ def _run_layout_detection(
500
+ self, images: list[Image.Image], threshold: float = 0.5,
501
+ ) -> list[list[dict]]:
502
+ """Run PP-DocLayoutV3 on a batch of PIL images, return per-image detections."""
503
+ device = self.device
504
+ tvF = self._tvF
505
+
506
+ target_sizes = torch.tensor([img.size[::-1] for img in images])
507
+ tensors = [tvF.pil_to_tensor(img) for img in images]
508
+
509
+ # GPU-accelerated resize + normalize
510
+ result = torch.empty(
511
+ len(tensors), 3, _LAYOUT_TARGET_H, _LAYOUT_TARGET_W,
512
+ dtype=torch.float16, device=device,
513
+ )
514
+ size_groups: dict[tuple[int, int], list[int]] = {}
515
+ for i, t in enumerate(tensors):
516
+ size_groups.setdefault((t.shape[1], t.shape[2]), []).append(i)
517
+
518
+ for shape, indices in size_groups.items():
519
+ batch = torch.stack([tensors[i] for i in indices])
520
+ batch = batch.to(device=device, dtype=torch.float32, non_blocking=True)
521
+ batch = F.interpolate(
522
+ batch, size=(_LAYOUT_TARGET_H, _LAYOUT_TARGET_W),
523
+ mode="bicubic", align_corners=False, antialias=False,
524
+ )
525
+ batch = (batch.clamp_(0, 255) / 255.0).to(torch.float16)
526
+ for j, idx in enumerate(indices):
527
+ result[idx] = batch[j]
528
+ del batch
529
+
530
+ outputs = self._layout_det_model(pixel_values=result)
531
+ del result
532
+
533
+ # Postprocess on GPU
534
+ logits = outputs.logits
535
+ boxes = outputs.pred_boxes
536
+ order_logits = outputs.order_logits
537
+
538
+ box_centers, box_dims = boxes.split(2, dim=-1)
539
+ boxes_xyxy = torch.cat([box_centers - 0.5 * box_dims, box_centers + 0.5 * box_dims], dim=-1)
540
+
541
+ img_h, img_w = target_sizes.unbind(1)
542
+ scale = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(device, dtype=boxes_xyxy.dtype)
543
+ boxes_xyxy = boxes_xyxy * scale[:, None, :]
544
+
545
+ num_queries = logits.shape[1]
546
+ num_classes = logits.shape[2]
547
+ scores = logits.sigmoid()
548
+ scores_flat, index = scores.flatten(1).topk(num_queries, dim=-1)
549
+ labels = index % num_classes
550
+ box_indices = index // num_classes
551
+ boxes_xyxy = boxes_xyxy.gather(dim=1, index=box_indices.unsqueeze(-1).expand(-1, -1, 4))
552
+
553
+ order_seqs = self._layout_processor._get_order_seqs(order_logits)
554
+ order_seqs = order_seqs.gather(dim=1, index=box_indices)
555
+
556
+ batch_results = []
557
+ for s, l, b, o in zip(scores_flat, labels, boxes_xyxy, order_seqs):
558
+ mask = s >= threshold
559
+ o_valid = o[mask]
560
+ _, indices_sorted = o_valid.sort()
561
+
562
+ detections = []
563
+ for si, li, bi in zip(s[mask][indices_sorted], l[mask][indices_sorted], b[mask][indices_sorted]):
564
+ detections.append({
565
+ "category": self._layout_id2label[li.item()],
566
+ "bbox": [round(x, 2) for x in bi.tolist()],
567
+ "score": round(si.item(), 4),
568
+ })
569
+ batch_results.append(detections)
570
+
571
+ return batch_results
572
+
573
+ # Core batch decode (shared by generate & generate_with_layout)
574
+
575
+ def _generate_batch(
576
+ self,
577
+ image_prompt_pairs: list[tuple],
578
+ *,
579
+ max_new_tokens: int,
580
+ temperature: float,
581
+ top_k: int | None,
582
+ min_dimension: int,
583
+ max_dimension: int,
584
+ seed: int | None,
585
+ ) -> list[str]:
586
+ """Core autoregressive decode for a list of (image, prompt) pairs."""
587
+ device = self.device
588
+ tokenizer = self._get_tokenizer()
589
+ self._pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
590
+ stop_token_ids = [self.config.eos_id, tokenizer.convert_tokens_to_ids("<|end_of_query|>")]
591
+
592
+ batch_inputs = process_batch(
593
+ tokenizer, self.config, image_prompt_pairs,
594
+ max_length=4096, min_dimension=min_dimension, max_dimension=max_dimension,
595
+ )
596
+ batch_inputs = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch_inputs.items()}
597
+
598
+ tokens = batch_inputs["tokens"]
599
+ B, L = tokens.size()
600
+ block_size = 128
601
+ S = (L + max_new_tokens + block_size - 1) // block_size * block_size
602
+ assert S <= self.config.max_seq_len
603
+
604
+ rng = torch.Generator(device).manual_seed(seed) if seed is not None else None
605
+
606
+ kv_cache = KVCache(
607
+ max_batch_size=B, max_seq_length=S, n_heads=self.config.n_heads,
608
+ head_dim=self.config.head_dim, num_layers=self.config.n_layers,
609
+ )
610
+
611
+ padded_tokens = torch.full((B, S), self._pad_token_id, dtype=tokens.dtype, device=device)
612
+ padded_tokens[:, :L] = tokens
613
+
614
+ attention_mask = self.get_attention_mask(padded_tokens, max_len=S)
615
+
616
+ logits_BSV = self.forward(
617
+ tokens=tokens, rope_pos_t=batch_inputs["pos_t"], rope_pos_hw=batch_inputs["pos_hw"],
618
+ attention_mask=attention_mask, kv_cache=kv_cache,
619
+ pixel_values=batch_inputs["pixel_values"], pixel_mask=batch_inputs["pixel_mask"],
620
+ )
621
+
622
+ stop_ids = torch.tensor(stop_token_ids).to(device)
623
+ should_stop_B = torch.full((B,), False, dtype=torch.bool, device=device)
624
+ generated_ids: list[list[int]] = [[] for _ in range(B)]
625
+
626
+ while not torch.all(should_stop_B) and (pos := kv_cache.get_pos()) < S:
627
+ tokens_B1 = sample_next_token(logits_BSV[:, -1], rng, temperature, top_k)
628
+
629
+ if torch.any(should_stop_B):
630
+ tokens_B1 = tokens_B1.clone()
631
+ tokens_B1[should_stop_B, :] = self._pad_token_id
632
+ padded_tokens[:, pos] = tokens_B1[:, -1]
633
+
634
+ for b in range(B):
635
+ if not should_stop_B[b]:
636
+ generated_ids[b].append(tokens_B1[b, 0].item())
637
+
638
+ logits_BSV = self.forward(
639
+ tokens=tokens_B1, attention_mask=attention_mask, kv_cache=kv_cache,
640
+ )
641
+
642
+ hit_stop_B = torch.isin(tokens_B1, stop_ids).any(dim=-1)
643
+ should_stop_B = should_stop_B.logical_or(hit_stop_B)
644
+
645
+ results = []
646
+ for b in range(B):
647
+ text = tokenizer.decode(generated_ids[b], skip_special_tokens=False)
648
+ text = text.replace("<|end_of_query|>", "").replace("<|end_of_text|>", "").strip()
649
+ results.append(text)
650
+
651
+ return results
652
+
653
+ # Main API: generate
654
+
655
+ @torch.inference_mode()
656
+ def generate(
657
+ self,
658
+ images,
659
+ *,
660
+ category: str | list[str] = "plain",
661
+ max_new_tokens: int = 4096,
662
+ temperature: float = 0.0,
663
+ top_k: int | None = None,
664
+ min_dimension: int = 64,
665
+ max_dimension: int = 1024,
666
+ compile: bool = True,
667
+ seed: int | None = 42,
668
+ ) -> list[str]:
669
+ """
670
+ Extract text from document images.
671
+
672
+ Args:
673
+ images: Single PIL Image (or path/URL) or list of them.
674
+ category: OCR category — one of "plain", "text", "table", "formula",
675
+ "caption", "footnote", "list-item", "page-footer", "page-header",
676
+ "section-header", "title". Can be a single string (applied to all
677
+ images) or a list (one per image).
678
+ max_new_tokens: Maximum generation steps.
679
+ temperature: Sampling temperature (0.0 = greedy).
680
+ top_k: Top-k sampling (None = disabled).
681
+ min_dimension: Min image side after resize.
682
+ max_dimension: Max image side after resize.
683
+ compile: Whether to torch.compile on first call.
684
+ seed: Random seed for reproducibility (None = non-deterministic).
685
+
686
+ Returns:
687
+ List of extracted text strings, one per image.
688
+ """
689
+ self._ensure_device_buffers()
690
+ if compile:
691
+ self.compile_model()
692
+
693
+ if isinstance(images, (str, Path, Image.Image)):
694
+ images = [images]
695
+ if isinstance(category, str):
696
+ category = [category] * len(images)
697
+ assert len(images) == len(category), "Must provide one category per image"
698
+
699
+ image_prompt_pairs = []
700
+ for img, cat in zip(images, category):
701
+ instruction = CATEGORY_PROMPTS.get(cat.strip().lower(), CATEGORY_PROMPTS["plain"])
702
+ prompt = f"<|image|>{instruction}\n<|OCR_PLAIN|>"
703
+ image_prompt_pairs.append((img, prompt))
704
+
705
+ return self._generate_batch(
706
+ image_prompt_pairs,
707
+ max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k,
708
+ min_dimension=min_dimension, max_dimension=max_dimension, seed=seed,
709
+ )
710
+
711
+ # Main API: generate_with_layout
712
+
713
+ @torch.inference_mode()
714
+ def generate_with_layout(
715
+ self,
716
+ images,
717
+ *,
718
+ max_new_tokens: int = 4096,
719
+ temperature: float = 0.0,
720
+ top_k: int | None = None,
721
+ min_dimension: int = 64,
722
+ max_dimension: int = 1024,
723
+ compile: bool = True,
724
+ seed: int | None = 42,
725
+ layout_threshold: float = 0.3,
726
+ layout_batch_size: int = 4,
727
+ ocr_batch_size: int = 32,
728
+ containment_threshold: float = 0.8,
729
+ layout_model: str = "PaddlePaddle/PP-DocLayoutV3_safetensors",
730
+ ) -> list[list[dict]]:
731
+ """
732
+ Run layout detection then OCR on each detected region.
733
+
734
+ Args:
735
+ images: Single PIL Image (or path/URL) or list of them.
736
+ max_new_tokens: Maximum generation steps per crop.
737
+ temperature: Sampling temperature (0.0 = greedy).
738
+ top_k: Top-k sampling (None = disabled).
739
+ min_dimension: Min crop side after resize for OCR.
740
+ max_dimension: Max crop side after resize for OCR.
741
+ compile: Whether to torch.compile on first call.
742
+ seed: Random seed for reproducibility.
743
+ layout_threshold: Confidence threshold for layout detections.
744
+ layout_batch_size: Batch size for layout detection.
745
+ ocr_batch_size: Batch size for OCR generation (chunks crops).
746
+ containment_threshold: Drop formula boxes whose containment ratio inside a text box exceeds this threshold.
747
+ layout_model: HuggingFace model ID for layout detection.
748
+
749
+ Returns:
750
+ Per-image list of detections, each a dict with keys:
751
+ ``category``, ``bbox`` [x1,y1,x2,y2], ``score``, ``text``.
752
+ """
753
+ self._ensure_device_buffers()
754
+ if compile:
755
+ self.compile_model()
756
+ self._load_layout_model(layout_model)
757
+
758
+ if isinstance(images, (str, Path, Image.Image)):
759
+ images = [images]
760
+ pil_images = [load_image(img).convert("RGB") for img in images]
761
+
762
+ # --- Layout detection (batched) ---
763
+ all_layout_dets: list[list[dict]] = []
764
+ for i in range(0, len(pil_images), layout_batch_size):
765
+ batch_imgs = pil_images[i : i + layout_batch_size]
766
+ dets = self._run_layout_detection(batch_imgs, threshold=layout_threshold)
767
+ all_layout_dets.extend(dets)
768
+
769
+ # --- Filter nested boxes (e.g. inline formulas inside text) ---
770
+ all_layout_dets = [
771
+ _filter_nested_detections(dets, containment_threshold)
772
+ for dets in all_layout_dets
773
+ ]
774
+
775
+ # --- Build crops + track origin ---
776
+ flat_crops: list[tuple[Image.Image, str]] = []
777
+ crop_origins: list[tuple[int, int]] = [] # (image_idx, det_idx)
778
+
779
+ for img_idx, (pil_img, dets) in enumerate(zip(pil_images, all_layout_dets)):
780
+ if not dets or (len(dets) == 1 and dets[0]["category"].strip().lower() == "image"):
781
+ prompt = f"<|image|>{CATEGORY_PROMPTS['plain']}\n<|OCR_PLAIN|>"
782
+ flat_crops.append((pil_img, prompt))
783
+ crop_origins.append((img_idx, -1))
784
+ continue
785
+
786
+ img_w, img_h = pil_img.size
787
+ for det_idx, det in enumerate(dets):
788
+ cat_key = det["category"].strip().lower()
789
+ ocr_cat = LAYOUT_TO_OCR_CATEGORY.get(cat_key)
790
+ if ocr_cat is None:
791
+ continue
792
+
793
+ x1, y1, x2, y2 = det["bbox"]
794
+ x1 = max(0, int(x1))
795
+ y1 = max(0, int(y1))
796
+ x2 = min(img_w, int(x2 + 0.5))
797
+ y2 = min(img_h, int(y2 + 0.5))
798
+ cw, ch = x2 - x1, y2 - y1
799
+ if cw < _MIN_CROP_DIM or ch < _MIN_CROP_DIM:
800
+ continue
801
+ short, long = sorted((cw, ch))
802
+ resized_short = short * (max_dimension / long) if long > max_dimension else short
803
+ if resized_short < _MIN_CROP_DIM:
804
+ continue
805
+
806
+ crop = pil_img.crop((x1, y1, x2, y2))
807
+ instruction = CATEGORY_PROMPTS.get(ocr_cat, CATEGORY_PROMPTS["plain"])
808
+ prompt = f"<|image|>{instruction}\n<|OCR_PLAIN|>"
809
+ flat_crops.append((crop, prompt))
810
+ crop_origins.append((img_idx, det_idx))
811
+
812
+ # --- OCR in chunks ---
813
+ flat_texts: list[str] = []
814
+ for i in range(0, max(len(flat_crops), 1), ocr_batch_size):
815
+ chunk = flat_crops[i : i + ocr_batch_size]
816
+ if not chunk:
817
+ break
818
+ texts = self._generate_batch(
819
+ chunk,
820
+ max_new_tokens=max_new_tokens, temperature=temperature, top_k=top_k,
821
+ min_dimension=min_dimension, max_dimension=max_dimension, seed=seed,
822
+ )
823
+ flat_texts.extend(texts)
824
+
825
+ # --- Reassemble per-image results ---
826
+ results: list[list[dict]] = [[] for _ in range(len(pil_images))]
827
+ for (img_idx, det_idx), text in zip(crop_origins, flat_texts):
828
+ if det_idx == -1:
829
+ img_w, img_h = pil_images[img_idx].size
830
+ results[img_idx].append({
831
+ "category": "plain",
832
+ "bbox": [0, 0, img_w, img_h],
833
+ "score": 1.0,
834
+ "text": text,
835
+ })
836
+ else:
837
+ det = all_layout_dets[img_idx][det_idx]
838
+ results[img_idx].append({
839
+ "category": det["category"],
840
+ "bbox": det["bbox"],
841
+ "score": det["score"],
842
+ "text": text,
843
+ })
844
+
845
+ return results
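Usage sketch for the two entry points defined above. This is a minimal illustration and not part of the committed files: it assumes the checkpoint is loaded with `AutoModel.from_pretrained(..., trust_remote_code=True)` so this modeling code is picked up, and the image path is a placeholder.

    # Hedged usage sketch: the image path is a placeholder; the model id is this repo.
    import torch
    from PIL import Image
    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "tiiuae/Falcon-OCR", trust_remote_code=True, torch_dtype=torch.bfloat16,
    ).to("cuda").eval()

    page = Image.open("sample_page.png").convert("RGB")

    # Whole-page OCR: one category string is broadcast to every image in the list.
    texts = model.generate([page], category="plain", max_new_tokens=2048)
    print(texts[0])

    # Layout-aware OCR: detect regions with PP-DocLayoutV3, then OCR each crop.
    detections = model.generate_with_layout([page], layout_threshold=0.3)
    for det in detections[0]:
        print(det["category"], det["bbox"], det["text"][:80])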
processing_falcon_ocr.py ADDED
@@ -0,0 +1,423 @@
1
+ import io
2
+ import math
3
+
4
+ import einops as E
5
+ import numpy as np
6
+ import requests
7
+ import torch
8
+ from PIL import Image
9
+ from transformers.image_processing_utils import BaseImageProcessor
10
+ from transformers.image_transforms import convert_to_rgb, resize
11
+ from transformers.image_utils import (
12
+ ImageInput,
13
+ get_image_size,
14
+ infer_channel_dimension_format,
15
+ to_numpy_array,
16
+ valid_images,
17
+ validate_preprocess_arguments,
18
+ )
19
+
20
+ IMAGE_MEAN = [0.5, 0.5, 0.5]
21
+ IMAGE_STD = [0.5, 0.5, 0.5]
22
+
23
+
24
+ def load_image(image):
25
+ if image is None:
26
+ return None
27
+ if isinstance(image, Image.Image):
28
+ return image
29
+ if isinstance(image, str):
30
+ if image.startswith(("http://", "https://")):
31
+ response = requests.get(image, timeout=10)
32
+ response.raise_for_status()
33
+ return Image.open(io.BytesIO(response.content))
34
+ if image.endswith(".npy"):
35
+ img_array = io.BytesIO(np.load(image))
36
+ return Image.open(img_array)
37
+ return Image.open(image)
38
+ if isinstance(image, np.bytes_):
39
+ return Image.open(io.BytesIO(image))
40
+ if isinstance(image, np.ndarray):
41
+ return Image.fromarray(image)
42
+ raise TypeError(f"Unknown image format {image}")
43
+
44
+
45
+ def load_images(images_input, min_dimension: int, max_dimension: int):
46
+ images = []
47
+ if images_input is not None:
48
+ for inp in images_input:
49
+ img = load_image(inp)
50
+ img = resize_image_if_necessary(img, min_dimension, max_dimension)
51
+ images.append(img)
52
+ return images
53
+
54
+
55
+ def resize_image_if_necessary(
56
+ image,
57
+ shortest_dimension=224,
58
+ longest_dimension=896,
59
+ ):
60
+ original_width, original_height = image.size
61
+ aspect_ratio = original_width / original_height
62
+
63
+ if (
64
+ shortest_dimension <= original_width <= longest_dimension
65
+ and shortest_dimension <= original_height <= longest_dimension
66
+ ):
67
+ return image
68
+
69
+ is_vertical_image = original_width < original_height
70
+ if original_width < shortest_dimension or original_height < shortest_dimension:
71
+ if is_vertical_image:
72
+ new_width = shortest_dimension
73
+ new_height = int(new_width / aspect_ratio)
74
+ else:
75
+ new_height = shortest_dimension
76
+ new_width = int(new_height * aspect_ratio)
77
+ else:
78
+ if is_vertical_image:
79
+ new_width = longest_dimension
80
+ new_height = int(new_width / aspect_ratio)
81
+ else:
82
+ new_height = longest_dimension
83
+ new_width = int(new_height * aspect_ratio)
84
+
85
+ if new_width > longest_dimension:
86
+ new_width = longest_dimension
87
+ new_height = int(new_width / aspect_ratio)
88
+ if new_height > longest_dimension:
89
+ new_height = longest_dimension
90
+ new_width = int(new_height * aspect_ratio)
91
+
92
+ resized_image = image.resize((new_width, new_height))
93
+ return resized_image
94
+
95
+
96
+ def smart_resize(
97
+ image,
98
+ factor: int,
99
+ resample,
100
+ input_data_format,
101
+ min_pixels: int = 56 * 56,
102
+ max_pixels: int = 14 * 14 * 4 * 1280,
103
+ ):
104
+ height, width = get_image_size(image, channel_dim=input_data_format)
105
+ if height < factor or width < factor:
106
+ raise ValueError(f"{height=} or {width=} must be larger than {factor=}")
107
+ if max(height, width) / min(height, width) > 200:
108
+ raise ValueError(
109
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
110
+ )
111
+ h_bar = round(height / factor) * factor
112
+ w_bar = round(width / factor) * factor
113
+ if h_bar * w_bar > max_pixels:
114
+ beta = np.sqrt((height * width) / max_pixels)
115
+ h_bar = math.floor(height / beta / factor) * factor
116
+ w_bar = math.floor(width / beta / factor) * factor
117
+ elif h_bar * w_bar < min_pixels:
118
+ beta = np.sqrt(min_pixels / (height * width))
119
+ h_bar = math.ceil(height * beta / factor) * factor
120
+ w_bar = math.ceil(width * beta / factor) * factor
121
+ image = resize(
122
+ image,
123
+ size=(h_bar, w_bar),
124
+ resample=resample,
125
+ input_data_format=input_data_format,
126
+ )
127
+ return image
128
+
129
+
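A worked illustration of the `smart_resize` pixel budget above, with made-up page dimensions; it mirrors the max_pixels branch rather than calling the function, so the numbers are purely illustrative.

    import math

    # Illustration only: a hypothetical 2000x1400 page, factor = patch_size * merge_size = 16.
    height, width, factor = 2000, 1400, 16
    max_pixels = 14 * 14 * 4 * 1280                          # 1,003,520 pixel budget

    h_bar = round(height / factor) * factor                  # 2000
    w_bar = round(width / factor) * factor                   # 1408
    if h_bar * w_bar > max_pixels:                           # 2,816,000 > 1,003,520: shrink
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor  # 1184
        w_bar = math.floor(width / beta / factor) * factor   # 832
    print(h_bar, w_bar, h_bar * w_bar)                       # 1184 832 985088, under budget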
130
+ class ImageProcessor(BaseImageProcessor):
131
+ def __init__(
132
+ self,
133
+ patch_size,
134
+ merge_size,
135
+ do_resize: bool = True,
136
+ resample: Image.Resampling = Image.Resampling.BICUBIC,
137
+ do_rescale: bool = True,
138
+ rescale_factor: float = 1 / 255,
139
+ do_normalize: bool = True,
140
+ image_mean: float | list[float] | None = None,
141
+ image_std: float | list[float] | None = None,
142
+ do_convert_rgb: bool = True,
143
+ min_pixels: int = 56 * 56,
144
+ max_pixels: int = 28 * 28 * 1280,
145
+ **kwargs,
146
+ ) -> None:
147
+ super().__init__(**kwargs)
148
+ self.do_resize = do_resize
149
+ self.resample = resample
150
+ self.do_rescale = do_rescale
151
+ self.rescale_factor = rescale_factor
152
+ self.do_normalize = do_normalize
153
+ self.image_mean = image_mean or IMAGE_MEAN
154
+ self.image_std = image_std or IMAGE_STD
155
+ self.min_pixels = min_pixels
156
+ self.max_pixels = max_pixels
157
+ self.patch_size = patch_size
158
+ self.merge_size = merge_size
159
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
160
+ self.do_convert_rgb = do_convert_rgb
161
+ validate_preprocess_arguments(
162
+ rescale_factor=self.rescale_factor,
163
+ do_normalize=self.do_normalize,
164
+ image_mean=self.image_mean,
165
+ image_std=self.image_std,
166
+ do_resize=self.do_resize,
167
+ size=self.size,
168
+ resample=self.resample,
169
+ )
170
+
171
+ def _preprocess(self, image: ImageInput, do_rescale=None, do_normalize=None):
172
+ if self.do_convert_rgb:
173
+ image = convert_to_rgb(image)
174
+ image = to_numpy_array(image)
175
+ input_data_format = infer_channel_dimension_format(image)
176
+ if self.do_resize:
177
+ image = smart_resize(
178
+ image,
179
+ factor=self.patch_size * self.merge_size,
180
+ resample=self.resample,
181
+ input_data_format=input_data_format,
182
+ min_pixels=self.min_pixels,
183
+ max_pixels=self.max_pixels,
184
+ )
185
+ if do_rescale or self.do_rescale:
186
+ image = self.rescale(image, scale=self.rescale_factor, input_data_format=input_data_format)
187
+ if do_normalize or self.do_normalize:
188
+ image = self.normalize(
189
+ image=image, mean=self.image_mean, std=self.image_std,
190
+ input_data_format=input_data_format,
191
+ )
192
+ return image
193
+
194
+ def preprocess(self, images: list[ImageInput] | None, do_rescale=None, do_normalize=None, **kwargs):
195
+ del kwargs
196
+ if images is None:
197
+ return []
198
+ images = [item for item in images if item is not None]
199
+ if not valid_images(images):
200
+ raise ValueError(
201
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
202
+ "torch.Tensor, tf.Tensor or jax.ndarray."
203
+ )
204
+ pixel_values = []
205
+ for image in images:
206
+ processed_image = self._preprocess(image, do_rescale, do_normalize)
207
+ processed_image = processed_image[None, ...]
208
+ pixel_values.append(processed_image)
209
+ return pixel_values
210
+
211
+ def batch_images_with_mask(self, pixel_values, max_image_height, max_image_width):
212
+ if pixel_values is None:
213
+ return None
214
+ pixel_values = [item for item in pixel_values if item is not None and len(item) != 0]
215
+ if len(pixel_values) == 0:
216
+ return None
217
+ pixel_values = [torch.from_numpy(img) for img in pixel_values]
218
+ max_temporal = max(img.shape[0] for img in pixel_values)
219
+
220
+ def pad_image_and_mask(img):
221
+ time_steps, height, width, channels = img.shape
222
+ if channels != 3:
223
+ raise ValueError(f"Expected 3-channel RGB images, got {channels} channels.")
224
+ padding = (0, 0, 0, max_image_width - width, 0, max_image_height - height, 0, max_temporal - time_steps)
225
+ padded_image = torch.nn.functional.pad(img, padding)
226
+ mask = torch.zeros((max_temporal, max_image_height, max_image_width), dtype=torch.long)
227
+ mask[:time_steps, :height, :width] = 1
228
+ return padded_image, mask
229
+
230
+ padded_pixel_values, padding_masks = zip(*[pad_image_and_mask(img) for img in pixel_values])
231
+ padded_pixel_values = torch.stack(list(padded_pixel_values))
232
+ padding_masks = torch.stack(list(padding_masks))
233
+ return {"pixel_values": padded_pixel_values, "padding_mask": padding_masks}
234
+
235
+
236
+ # ---------------------------------------------------------------------------
237
+ # Positional encoding helpers
238
+ # ---------------------------------------------------------------------------
239
+
240
+ def _compute_image_spatial_positions(
241
+ pixel_mask_THW: torch.Tensor,
242
+ spatial_patch_size: int,
243
+ temporal_patch_size: int = 1,
244
+ ) -> tuple[torch.Tensor, torch.Tensor]:
245
+ mask_thw = E.reduce(
246
+ pixel_mask_THW,
247
+ "(t tp) (h hp) (w wp) -> t h w",
248
+ reduction="any",
249
+ tp=temporal_patch_size,
250
+ hp=spatial_patch_size,
251
+ wp=spatial_patch_size,
252
+ )
253
+ width = E.reduce(mask_thw.sum(dim=-1).int(), "t h -> ", reduction="max")
254
+ height = E.reduce(mask_thw.sum(dim=-2).int(), "t w -> ", reduction="max")
255
+ xlim = torch.sqrt(width / height)
256
+ ylim = torch.sqrt(height / width)
257
+ xpos = torch.linspace(-xlim, xlim, int(width))
258
+ ypos = torch.linspace(-ylim, ylim, int(height))
259
+ wpos, hpos = torch.meshgrid(xpos, ypos, indexing="xy")
260
+ return hpos.flatten(), wpos.flatten()
261
+
262
+
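A toy check of the spatial-position helper above, with a made-up 2x4 patch grid: the wider axis gets the wider coordinate range because of the square-root aspect normalisation. This re-implements the core lines rather than calling the helper, so the values are purely illustrative.

    import torch

    # Hypothetical fully-valid patch grid: 2 patch rows (h) and 4 patch columns (w).
    h, w = 2, 4
    xlim = (w / h) ** 0.5                    # ~1.414, wider axis spans a wider range
    ylim = (h / w) ** 0.5                    # ~0.707
    xpos = torch.linspace(-xlim, xlim, w)
    ypos = torch.linspace(-ylim, ylim, h)
    wpos, hpos = torch.meshgrid(xpos, ypos, indexing="xy")
    print(hpos.flatten())                    # 8 y-coordinates, one per patch token
    print(wpos.flatten())                    # 8 x-coordinates, one per patch token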
263
+ def _get_image_token_masks(tokens, config):
264
+ spatial_mask = tokens == config.img_id
265
+ no_increase_mask = (
266
+ spatial_mask
267
+ | (tokens == config.image_reg_1_token_id)
268
+ | (tokens == config.image_reg_2_token_id)
269
+ | (tokens == config.image_reg_3_token_id)
270
+ | (tokens == config.image_reg_4_token_id)
271
+ | (tokens == config.img_end_id)
272
+ )
273
+ return spatial_mask, no_increase_mask
274
+
275
+
276
+ def get_pos_thw(
277
+ tokens: torch.Tensor,
278
+ pixel_masks_NTHW: torch.Tensor,
279
+ config,
280
+ spatial_patch_size: int,
281
+ temporal_patch_size: int = 1,
282
+ pad_token_id: int = None,
283
+ ):
284
+ assert pad_token_id is not None
285
+ assert tokens.ndim == 2
286
+ assert pixel_masks_NTHW.ndim == 4
287
+
288
+ spatial_img_token_mask_BS, no_increase_idx_img_token_mask_BS = _get_image_token_masks(tokens, config)
289
+
290
+ hpos_parts, wpos_parts = [], []
291
+ for i in range(pixel_masks_NTHW.shape[0]):
292
+ h, w = _compute_image_spatial_positions(pixel_masks_NTHW[i], spatial_patch_size, temporal_patch_size)
293
+ hpos_parts.append(h)
294
+ wpos_parts.append(w)
295
+
296
+ hpos_N = torch.cat(hpos_parts) if hpos_parts else torch.empty(0)
297
+ wpos_N = torch.cat(wpos_parts) if wpos_parts else torch.empty(0)
298
+
299
+ expected_tokens = spatial_img_token_mask_BS.sum().item()
300
+ actual_tokens = hpos_N.numel()
301
+ assert actual_tokens == expected_tokens, (
302
+ f"Mismatch between spatial image tokens ({expected_tokens}) and generated positions ({actual_tokens})."
303
+ )
304
+
305
+ hpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
306
+ wpos_BS = torch.full_like(tokens, fill_value=torch.nan, dtype=torch.float, device=tokens.device)
307
+ hpos_BS = hpos_BS.masked_scatter_(spatial_img_token_mask_BS, hpos_N)
308
+ wpos_BS = wpos_BS.masked_scatter_(spatial_img_token_mask_BS, wpos_N)
309
+
310
+ tpos_BS = torch.ones_like(tokens, dtype=torch.float, device=tokens.device)
311
+ tpos_BS[no_increase_idx_img_token_mask_BS] = 0
312
+ tpos_BS = torch.cumsum(tpos_BS, dim=1) - 1
313
+ tpos_BS[tokens == pad_token_id] = 0
314
+
315
+ hw_pos_BS2 = torch.stack([hpos_BS, wpos_BS], dim=-1)
316
+ return tpos_BS.long(), hw_pos_BS2
317
+
318
+
319
+ def calculate_image_tokens(image, patch_size, merge_size):
320
+ height, width = get_image_size(image)
321
+ return int((height * width) / (patch_size * patch_size * merge_size * merge_size))
322
+
323
+
324
+ def tokenize_inputs(prompt, images, tokenizer, config, patch_size, merge_size, max_length):
325
+ img_reg_ids = [
326
+ config.image_reg_1_token_id,
327
+ config.image_reg_2_token_id,
328
+ config.image_reg_3_token_id,
329
+ config.image_reg_4_token_id,
330
+ ]
331
+
332
+ if images is not None and len(images) > 0:
333
+ image_token_counts = [calculate_image_tokens(image, patch_size, merge_size) for image in images]
334
+ else:
335
+ image_token_counts = []
336
+
337
+ image_token = tokenizer.convert_ids_to_tokens(config.img_id)
338
+ prompt_chunks = [tokenizer.encode(chunk) for chunk in prompt.split(image_token)]
339
+
340
+ def insert_separator(X, sep):
341
+ return [ele for sublist in zip(X, sep) for ele in sublist][:-1]
342
+
343
+ input_ids = []
344
+ offset = 0
345
+ bos_id = getattr(tokenizer, "bos_token_id", None)
346
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and bos_id is not None and prompt_chunks[0][0] == bos_id:
347
+ offset = 1
348
+ input_ids.append(prompt_chunks[0][0])
349
+
350
+ separators = []
351
+ for count in image_token_counts:
352
+ tokens = [config.img_id] * count
353
+ image_block = [config.image_cls_token_id, *img_reg_ids, *tokens, config.img_end_id]
354
+ separators.append(image_block)
355
+
356
+ if len(separators) != 0 and len(separators) != len(prompt_chunks):
357
+ separators.append(separators[-1])
358
+
359
+ selected_images = []
360
+ if len(separators) == 0:
361
+ input_ids = prompt_chunks[0]
362
+ else:
363
+ for index, x in enumerate(insert_separator(prompt_chunks, separators)):
364
+ if index % 2 != 0:
365
+ if (len(input_ids) + len(x)) < max_length:
366
+ input_ids.extend(x)
367
+ selected_images.append(images[index // 2])
368
+ elif index % 2 == 0:
369
+ input_ids.extend(x[offset:])
370
+
371
+ input_ids = torch.LongTensor(input_ids)
372
+ return input_ids, selected_images
373
+
374
+
375
+ def process_batch(
376
+ tokenizer,
377
+ config,
378
+ image_prompt_pairs,
379
+ max_length,
380
+ min_dimension,
381
+ max_dimension,
382
+ patch_size=16,
383
+ merge_size=1,
384
+ ):
385
+ """
386
+ Process a batch of images with text prompts.
387
+ Uses LEFT PADDING for proper batch generation with causal models.
388
+ """
389
+ all_input_ids = []
390
+ all_selected_images = []
391
+ processor_local = ImageProcessor(patch_size, merge_size)
392
+
393
+ for img_input, prompt in image_prompt_pairs:
394
+ img = load_image(img_input)
395
+ if img is not None:
396
+ img = resize_image_if_necessary(img, min_dimension, max_dimension)
397
+ images = processor_local.preprocess(images=[img] if img else [])
398
+ input_ids, selected_images = tokenize_inputs(
399
+ prompt, images, tokenizer, config, patch_size, merge_size, max_length,
400
+ )
401
+ all_input_ids.append(input_ids)
402
+ all_selected_images.extend(selected_images)
403
+
404
+ pad_token_id = tokenizer.convert_tokens_to_ids("<|pad|>")
405
+ padded_input_ids = torch.nn.utils.rnn.pad_sequence(
406
+ all_input_ids, batch_first=True, padding_value=pad_token_id, padding_side="left",
407
+ )
408
+
409
+ processed = processor_local.batch_images_with_mask(all_selected_images, max_dimension, max_dimension)
410
+ assert processed is not None
411
+
412
+ pos_t, pos_hw = get_pos_thw(
413
+ padded_input_ids, processed["padding_mask"], config, patch_size, pad_token_id=pad_token_id,
414
+ )
415
+
416
+ return {
417
+ "tokens": padded_input_ids,
418
+ "pixel_values": processed["pixel_values"],
419
+ "pixel_mask": processed["padding_mask"],
420
+ "pos_t": pos_t,
421
+ "pos_hw": pos_hw,
422
+ "pad_token_id": pad_token_id,
423
+ }
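The docstring of `process_batch` stresses left padding: with a causal decoder, the last real token of every row must sit at the same rightmost position so that a single forward step advances all rows together. A minimal standalone sketch of the padding call (values are made up; the `padding_side` argument of `pad_sequence` requires a recent PyTorch, 2.4 or newer).

    import torch

    pad_id = 0
    seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]

    # Left padding aligns the newest token of each sequence at the right edge,
    # which the batched decode loop in _generate_batch relies on.
    padded = torch.nn.utils.rnn.pad_sequence(
        seqs, batch_first=True, padding_value=pad_id, padding_side="left",
    )
    print(padded)   # tensor([[5, 6, 7],
                    #         [0, 8, 9]])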
rope.py ADDED
@@ -0,0 +1,127 @@
1
+ import einops as E
2
+ import torch
3
+
4
+
5
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
6
+ """
7
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
8
+
9
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
10
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
11
+ The returned tensor contains complex values in complex64 data type.
12
+
13
+ Args:
14
+ dim (int): Dimension of the frequency tensor.
15
+ end (int): End index for precomputing frequencies.
16
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
17
+
18
+ Returns:
19
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
20
+ """
21
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
22
+ t = torch.arange(end, device=freqs.device)
23
+ freqs = torch.outer(t, freqs).float()
24
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
25
+ return freqs_cis # [S, D//2]
26
+
27
+
28
+ def apply_rotary_emb(
29
+ xq: torch.Tensor,
30
+ xk: torch.Tensor,
31
+ freqs_cis: torch.Tensor,
32
+ ) -> tuple[torch.Tensor, torch.Tensor]:
33
+ """1D rotary embedding"""
34
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
35
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
36
+ assert freqs_cis.ndim == 3, (
37
+ "Freqs_cis must be indexed by position ids already and has shape (B,S,D)"
38
+ )
39
+ freqs_cis = E.rearrange(freqs_cis, "b s d -> b s 1 d")
40
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
41
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
42
+ return xq_out.type_as(xq), xk_out.type_as(xk)
43
+
44
+
45
+ ###### 2D golden rope
46
+ """
47
+ Dimension key:
48
+ B: batch size
49
+ S: number of tokens per sample, Seqlen
50
+ T: Number of selected Tokens
51
+ P: pos_dim
52
+ h: n_heads
53
+ d: head_dim
54
+ F: num_freqs == head_dim // 2
55
+ """
56
+
57
+
58
+ def apply_golden_freqs_cis_to_visual_pos(freqs_hFP, pos_BSP) -> torch.Tensor:
59
+ """
60
+ This function is applied once per input batch, and the cached
61
+ freqs_cis is passed through to all layers.
62
+ Safe for Torch-Inductor because it never uses boolean indexing on a symbolic tensor.
63
+ """
64
+ # 1. Boolean mask → integer indices (no unbacked shapes)
65
+ img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
66
+ idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True) # each shape: (N,)
67
+
68
+ # 2. Gather the positional tensor for those tokens
69
+ pos_tP = pos_BSP[idx_b, idx_s].float() # (N, p)
70
+
71
+ # 3. Project positions onto the frequency table → angles θ
72
+ theta_thF = torch.einsum("tp,hfp->thf", pos_tP, freqs_hFP.float()) # (t, h, f)
73
+
74
+ # 4. Convert to complex numbers on the unit circle
75
+ freqs_cis_thF = torch.polar(torch.ones_like(theta_thF), theta_thF)
76
+ return freqs_cis_thF
77
+
78
+
79
+ def apply_golden_rotary_emb(input_BShd, freqs_cis_thF, pos_BSP) -> torch.Tensor:
80
+ """
81
+ Rotates *only* the image tokens in `input_BShd`. No boolean indexing,
82
+ so it is safe for Torch-Inductor.
83
+ """
84
+ img_mask_BS = E.reduce(~torch.isnan(pos_BSP), 'b s p -> b s', reduction='all')
85
+ idx_b, idx_s = torch.nonzero(img_mask_BS, as_tuple=True) # (N,)
86
+
87
+ input_thd = input_BShd[idx_b, idx_s].float() # (N, h, d)
88
+ x_even = input_thd[..., 0::2] # (N, h, F)
89
+ x_odd = input_thd[..., 1::2] # (N, h, F)
90
+
91
+ cos_thF = freqs_cis_thF.real
92
+ sin_thF = freqs_cis_thF.imag
93
+
94
+ # (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
95
+ rot_even = x_even * cos_thF - x_odd * sin_thF
96
+ rot_odd = x_even * sin_thF + x_odd * cos_thF
97
+
98
+ output_real = torch.empty_like(input_thd)
99
+ output_real[..., 0::2] = rot_even
100
+ output_real[..., 1::2] = rot_odd
101
+ output_real = output_real.type_as(input_BShd)
102
+
103
+ output_BShd = input_BShd.clone()
104
+ output_BShd[idx_b, idx_s] = output_real
105
+
106
+ return output_BShd
107
+
108
+
109
+ def apply_3d_rotary_emb(
110
+ xq: torch.Tensor, # (B, S, H, D)
111
+ xk: torch.Tensor, # (B, S, H, D)
112
+ freqs_cis: torch.Tensor,
113
+ freqs_cis_2d: torch.Tensor | None,
114
+ pos_hw: torch.Tensor | None, # (B,S,3)
115
+ ) -> tuple[torch.Tensor, torch.Tensor]:
116
+ xq_t, xq_hw = xq.chunk(chunks=2, dim=-1)
117
+ xk_t, xk_hw = xk.chunk(chunks=2, dim=-1)
118
+ B, S, H, D = xq.shape
119
+
120
+ xq_t, xk_t = apply_rotary_emb(xq_t, xk_t, freqs_cis)
121
+ if freqs_cis_2d is not None and pos_hw is not None:
122
+ xq_hw = apply_golden_rotary_emb(xq_hw, freqs_cis_2d, pos_hw)
123
+ xk_hw = apply_golden_rotary_emb(xk_hw, freqs_cis_2d, pos_hw)
124
+
125
+ xq_out = torch.concat([xq_t, xq_hw], dim=-1).type_as(xq)
126
+ xk_out = torch.concat([xk_t, xk_hw], dim=-1).type_as(xk)
127
+ return xq_out, xk_out
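A shape-level sanity sketch for the 1D rotary helpers in this file, with made-up sizes; the module name used in the import is a placeholder for however this file ends up being loaded.

    import torch
    from rope import precompute_freqs_cis, apply_rotary_emb   # placeholder module name

    B, S, H, D = 2, 16, 4, 64
    freqs = precompute_freqs_cis(dim=D, end=1024)       # (1024, D//2) complex64 table
    pos = torch.arange(S).unsqueeze(0).expand(B, S)     # (B, S) position ids
    freqs_bsd = freqs[pos]                              # gather per-token rows -> (B, S, D//2)

    xq = torch.randn(B, S, H, D)
    xk = torch.randn(B, S, H, D)
    xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_bsd)
    assert xq_rot.shape == (B, S, H, D) and xk_rot.shape == (B, S, H, D)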
special_tokens_map.json ADDED
@@ -0,0 +1,390 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|pad|>",
4
+ ">>ABSTRACT<<",
5
+ ">>INTRODUCTION<<",
6
+ ">>SUMMARY<<",
7
+ ">>COMMENT<<",
8
+ ">>ANSWER<<",
9
+ ">>QUESTION<<",
10
+ ">>DOMAIN<<",
11
+ ">>PREFIX<<",
12
+ ">>SUFFIX<<",
13
+ ">>MIDDLE<<",
14
+ "<|finetune_right_pad_id|>",
15
+ "<|start_header_id|>",
16
+ "<|end_header_id|>",
17
+ "<|eom_id|>",
18
+ "<|eot_id|>",
19
+ "<|begin_of_text|>",
20
+ ">>TITLE<<",
21
+ "<tool_response>",
22
+ "</tool_response>",
23
+ "<tool_call>",
24
+ "</tool_call>",
25
+ "<schema>",
26
+ "</schema>",
27
+ "<scratch_pad>",
28
+ "</scratch_pad>",
29
+ "<thinking>",
30
+ "</thinking>",
31
+ "<explanation>",
32
+ "</explanation>",
33
+ "<file_sep>",
34
+ "<repo_name>",
35
+ "<tr>",
36
+ "</tr>",
37
+ "<|image|>",
38
+ "<|image_row_sep|>",
39
+ "<|start_of_image|>",
40
+ "<|end_of_image|>",
41
+ "<|start_of_video|>",
42
+ "<|end_of_video|>",
43
+ "<|frame_sep|>",
44
+ "<|start_of_turn|>",
45
+ "<|end_of_turn|>",
46
+ "<|start_of_diffusion_query|>",
47
+ "<|end_of_diffusion_query|>",
48
+ "<|diffusion_query|>",
49
+ "<|object|>",
50
+ "<|coord|>",
51
+ "<|size|>",
52
+ "<|perceive|>",
53
+ "<|image_mask_token|>",
54
+ "<|image_cls|>",
55
+ "<|image_reg_1|>",
56
+ "<|image_reg_2|>",
57
+ "<|image_reg_3|>",
58
+ "<|image_reg_4|>",
59
+ "<|image_reg_5|>",
60
+ "<|image_reg_6|>",
61
+ "<|image_reg_7|>",
62
+ "<|image_reg_8|>",
63
+ "<|DET|>",
64
+ "<|POINTING|>",
65
+ "<|OCR_GROUNDING|>",
66
+ "<|OCR_DOC_PARSER|>",
67
+ "<|OCR_PLAIN|>",
68
+ "<|REF_SEG|>",
69
+ "<|POINT_REF_SEG|>",
70
+ "<|CAPTION|>",
71
+ "<|DETAILED_CAPTION|>",
72
+ "<|seg|>",
73
+ "<|end_of_query|>",
74
+ "<|start_of_query|>",
75
+ "<|task_sep|>",
76
+ "<|QA|>",
77
+ "<|LAYOUT_DETECTION|>",
78
+ "<|category_sep|>",
79
+ "<td>",
80
+ "</td>",
81
+ "<th>",
82
+ "</th>",
83
+ ">>UNUSED_261<<",
84
+ ">>UNUSED_262<<",
85
+ ">>UNUSED_263<<",
86
+ ">>UNUSED_264<<",
87
+ ">>UNUSED_265<<",
88
+ ">>UNUSED_266<<",
89
+ ">>UNUSED_267<<",
90
+ ">>UNUSED_268<<",
91
+ ">>UNUSED_269<<",
92
+ ">>UNUSED_270<<",
93
+ ">>UNUSED_271<<",
94
+ ">>UNUSED_272<<",
95
+ ">>UNUSED_273<<",
96
+ ">>UNUSED_274<<",
97
+ ">>UNUSED_275<<",
98
+ ">>UNUSED_276<<",
99
+ ">>UNUSED_277<<",
100
+ ">>UNUSED_278<<",
101
+ ">>UNUSED_279<<",
102
+ ">>UNUSED_280<<",
103
+ ">>UNUSED_281<<",
104
+ ">>UNUSED_282<<",
105
+ ">>UNUSED_283<<",
106
+ ">>UNUSED_284<<",
107
+ ">>UNUSED_285<<",
108
+ ">>UNUSED_286<<",
109
+ ">>UNUSED_287<<",
110
+ ">>UNUSED_288<<",
111
+ ">>UNUSED_289<<",
112
+ ">>UNUSED_290<<",
113
+ ">>UNUSED_291<<",
114
+ ">>UNUSED_292<<",
115
+ ">>UNUSED_293<<",
116
+ ">>UNUSED_294<<",
117
+ ">>UNUSED_295<<",
118
+ ">>UNUSED_296<<",
119
+ ">>UNUSED_297<<",
120
+ ">>UNUSED_298<<",
121
+ ">>UNUSED_299<<",
122
+ ">>UNUSED_300<<",
123
+ ">>UNUSED_301<<",
124
+ ">>UNUSED_302<<",
125
+ ">>UNUSED_303<<",
126
+ ">>UNUSED_304<<",
127
+ ">>UNUSED_305<<",
128
+ ">>UNUSED_306<<",
129
+ ">>UNUSED_307<<",
130
+ ">>UNUSED_308<<",
131
+ ">>UNUSED_309<<",
132
+ ">>UNUSED_310<<",
133
+ ">>UNUSED_311<<",
134
+ ">>UNUSED_312<<",
135
+ ">>UNUSED_313<<",
136
+ ">>UNUSED_314<<",
137
+ ">>UNUSED_315<<",
138
+ ">>UNUSED_316<<",
139
+ ">>UNUSED_317<<",
140
+ ">>UNUSED_318<<",
141
+ ">>UNUSED_319<<",
142
+ ">>UNUSED_320<<",
143
+ ">>UNUSED_321<<",
144
+ ">>UNUSED_322<<",
145
+ ">>UNUSED_323<<",
146
+ ">>UNUSED_324<<",
147
+ ">>UNUSED_325<<",
148
+ ">>UNUSED_326<<",
149
+ ">>UNUSED_327<<",
150
+ ">>UNUSED_328<<",
151
+ ">>UNUSED_329<<",
152
+ ">>UNUSED_330<<",
153
+ ">>UNUSED_331<<",
154
+ ">>UNUSED_332<<",
155
+ ">>UNUSED_333<<",
156
+ ">>UNUSED_334<<",
157
+ ">>UNUSED_335<<",
158
+ ">>UNUSED_336<<",
159
+ ">>UNUSED_337<<",
160
+ ">>UNUSED_338<<",
161
+ ">>UNUSED_339<<",
162
+ ">>UNUSED_340<<",
163
+ ">>UNUSED_341<<",
164
+ ">>UNUSED_342<<",
165
+ ">>UNUSED_343<<",
166
+ ">>UNUSED_344<<",
167
+ ">>UNUSED_345<<",
168
+ ">>UNUSED_346<<",
169
+ ">>UNUSED_347<<",
170
+ ">>UNUSED_348<<",
171
+ ">>UNUSED_349<<",
172
+ ">>UNUSED_350<<",
173
+ ">>UNUSED_351<<",
174
+ ">>UNUSED_352<<",
175
+ ">>UNUSED_353<<",
176
+ ">>UNUSED_354<<",
177
+ ">>UNUSED_355<<",
178
+ ">>UNUSED_356<<",
179
+ ">>UNUSED_357<<",
180
+ ">>UNUSED_358<<",
181
+ ">>UNUSED_359<<",
182
+ ">>UNUSED_360<<",
183
+ ">>UNUSED_361<<",
184
+ ">>UNUSED_362<<",
185
+ ">>UNUSED_363<<",
186
+ ">>UNUSED_364<<",
187
+ ">>UNUSED_365<<",
188
+ ">>UNUSED_366<<",
189
+ ">>UNUSED_367<<",
190
+ ">>UNUSED_368<<",
191
+ ">>UNUSED_369<<",
192
+ ">>UNUSED_370<<",
193
+ ">>UNUSED_371<<",
194
+ ">>UNUSED_372<<",
195
+ ">>UNUSED_373<<",
196
+ ">>UNUSED_374<<",
197
+ ">>UNUSED_375<<",
198
+ ">>UNUSED_376<<",
199
+ ">>UNUSED_377<<",
200
+ ">>UNUSED_378<<",
201
+ ">>UNUSED_379<<",
202
+ ">>UNUSED_380<<",
203
+ ">>UNUSED_381<<",
204
+ ">>UNUSED_382<<",
205
+ ">>UNUSED_383<<",
206
+ ">>UNUSED_384<<",
207
+ ">>UNUSED_385<<",
208
+ ">>UNUSED_386<<",
209
+ ">>UNUSED_387<<",
210
+ ">>UNUSED_388<<",
211
+ ">>UNUSED_389<<",
212
+ ">>UNUSED_390<<",
213
+ ">>UNUSED_391<<",
214
+ ">>UNUSED_392<<",
215
+ ">>UNUSED_393<<",
216
+ ">>UNUSED_394<<",
217
+ ">>UNUSED_395<<",
218
+ ">>UNUSED_396<<",
219
+ ">>UNUSED_397<<",
220
+ ">>UNUSED_398<<",
221
+ ">>UNUSED_399<<",
222
+ ">>UNUSED_400<<",
223
+ ">>UNUSED_401<<",
224
+ ">>UNUSED_402<<",
225
+ ">>UNUSED_403<<",
226
+ ">>UNUSED_404<<",
227
+ ">>UNUSED_405<<",
228
+ ">>UNUSED_406<<",
229
+ ">>UNUSED_407<<",
230
+ ">>UNUSED_408<<",
231
+ ">>UNUSED_409<<",
232
+ ">>UNUSED_410<<",
233
+ ">>UNUSED_411<<",
234
+ ">>UNUSED_412<<",
235
+ ">>UNUSED_413<<",
236
+ ">>UNUSED_414<<",
237
+ ">>UNUSED_415<<",
238
+ ">>UNUSED_416<<",
239
+ ">>UNUSED_417<<",
240
+ ">>UNUSED_418<<",
241
+ ">>UNUSED_419<<",
242
+ ">>UNUSED_420<<",
243
+ ">>UNUSED_421<<",
244
+ ">>UNUSED_422<<",
245
+ ">>UNUSED_423<<",
246
+ ">>UNUSED_424<<",
247
+ ">>UNUSED_425<<",
248
+ ">>UNUSED_426<<",
249
+ ">>UNUSED_427<<",
250
+ ">>UNUSED_428<<",
251
+ ">>UNUSED_429<<",
252
+ ">>UNUSED_430<<",
253
+ ">>UNUSED_431<<",
254
+ ">>UNUSED_432<<",
255
+ ">>UNUSED_433<<",
256
+ ">>UNUSED_434<<",
257
+ ">>UNUSED_435<<",
258
+ ">>UNUSED_436<<",
259
+ ">>UNUSED_437<<",
260
+ ">>UNUSED_438<<",
261
+ ">>UNUSED_439<<",
262
+ ">>UNUSED_440<<",
263
+ ">>UNUSED_441<<",
264
+ ">>UNUSED_442<<",
265
+ ">>UNUSED_443<<",
266
+ ">>UNUSED_444<<",
267
+ ">>UNUSED_445<<",
268
+ ">>UNUSED_446<<",
269
+ ">>UNUSED_447<<",
270
+ ">>UNUSED_448<<",
271
+ ">>UNUSED_449<<",
272
+ ">>UNUSED_450<<",
273
+ ">>UNUSED_451<<",
274
+ ">>UNUSED_452<<",
275
+ ">>UNUSED_453<<",
276
+ ">>UNUSED_454<<",
277
+ ">>UNUSED_455<<",
278
+ ">>UNUSED_456<<",
279
+ ">>UNUSED_457<<",
280
+ ">>UNUSED_458<<",
281
+ ">>UNUSED_459<<",
282
+ ">>UNUSED_460<<",
283
+ ">>UNUSED_461<<",
284
+ ">>UNUSED_462<<",
285
+ ">>UNUSED_463<<",
286
+ ">>UNUSED_464<<",
287
+ ">>UNUSED_465<<",
288
+ ">>UNUSED_466<<",
289
+ ">>UNUSED_467<<",
290
+ ">>UNUSED_468<<",
291
+ ">>UNUSED_469<<",
292
+ ">>UNUSED_470<<",
293
+ ">>UNUSED_471<<",
294
+ ">>UNUSED_472<<",
295
+ ">>UNUSED_473<<",
296
+ ">>UNUSED_474<<",
297
+ ">>UNUSED_475<<",
298
+ ">>UNUSED_476<<",
299
+ ">>UNUSED_477<<",
300
+ ">>UNUSED_478<<",
301
+ ">>UNUSED_479<<",
302
+ ">>UNUSED_480<<",
303
+ ">>UNUSED_481<<",
304
+ ">>UNUSED_482<<",
305
+ ">>UNUSED_483<<",
306
+ ">>UNUSED_484<<",
307
+ ">>UNUSED_485<<",
308
+ ">>UNUSED_486<<",
309
+ ">>UNUSED_487<<",
310
+ ">>UNUSED_488<<",
311
+ ">>UNUSED_489<<",
312
+ ">>UNUSED_490<<",
313
+ ">>UNUSED_491<<",
314
+ ">>UNUSED_492<<",
315
+ ">>UNUSED_493<<",
316
+ ">>UNUSED_494<<",
317
+ ">>UNUSED_495<<",
318
+ ">>UNUSED_496<<",
319
+ ">>UNUSED_497<<",
320
+ ">>UNUSED_498<<",
321
+ ">>UNUSED_499<<",
322
+ ">>UNUSED_500<<",
323
+ ">>UNUSED_501<<",
324
+ ">>UNUSED_502<<",
325
+ ">>UNUSED_503<<",
326
+ ">>UNUSED_504<<",
327
+ ">>UNUSED_505<<",
328
+ ">>UNUSED_506<<",
329
+ ">>UNUSED_507<<",
330
+ ">>UNUSED_508<<",
331
+ ">>UNUSED_509<<",
332
+ ">>UNUSED_510<<",
333
+ ">>UNUSED_511<<"
334
+ ],
335
+ "eos_token": {
336
+ "content": "<|end_of_text|>",
337
+ "lstrip": false,
338
+ "normalized": false,
339
+ "rstrip": false,
340
+ "single_word": false
341
+ },
342
+ "image_token": "<|image|>",
343
+ "image_cls_token": "<|image_cls|>",
344
+ "image_reg_1_token": "<|image_reg_1|>",
345
+ "image_reg_2_token": "<|image_reg_2|>",
346
+ "image_reg_3_token": "<|image_reg_3|>",
347
+ "image_reg_4_token": "<|image_reg_4|>",
348
+ "image_reg_5_token": "<|image_reg_5|>",
349
+ "image_reg_6_token": "<|image_reg_6|>",
350
+ "image_reg_7_token": "<|image_reg_7|>",
351
+ "image_reg_8_token": "<|image_reg_8|>",
352
+ "image_row_sep_token": "<|image_row_sep|>",
353
+ "start_of_image_token": "<|start_of_image|>",
354
+ "end_of_image_token": "<|end_of_image|>",
355
+ "start_of_video_token": "<|start_of_video|>",
356
+ "end_of_video_token": "<|end_of_video|>",
357
+ "frame_sep_token": "<|frame_sep|>",
358
+ "start_of_turn_token": "<|start_of_turn|>",
359
+ "end_of_turn_token": "<|end_of_turn|>",
360
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
361
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
362
+ "diffusion_query_token": "<|diffusion_query|>",
363
+ "object_token": "<|object|>",
364
+ "coord_token": "<|coord|>",
365
+ "size_token": "<|size|>",
366
+ "perceive_token": "<|perceive|>",
367
+ "image_mask_token": "<|image_mask_token|>",
368
+ "det_token": "<|DET|>",
369
+ "pointing_token": "<|POINTING|>",
370
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
371
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
372
+ "ocr_plain_token": "<|OCR_PLAIN|>",
373
+ "ref_seg_token": "<|REF_SEG|>",
374
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
375
+ "caption_token": "<|CAPTION|>",
376
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
377
+ "seg_token": "<|seg|>",
378
+ "start_of_query_token": "<|start_of_query|>",
379
+ "end_of_query_token": "<|end_of_query|>",
380
+ "task_sep_token": "<|task_sep|>",
381
+ "qa_token": "<|QA|>",
382
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
383
+ "category_sep_token": "<|category_sep|>",
384
+ "table_row_start_token": "<tr>",
385
+ "table_row_end_token": "</tr>",
386
+ "table_data_start_token": "<td>",
387
+ "table_data_end_token": "</td>",
388
+ "table_header_start_token": "<th>",
389
+ "table_header_end_token": "</th>"
390
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,110 @@
1
+ {
2
+ "backend": "tokenizers",
3
+ "caption_token": "<|CAPTION|>",
4
+ "category_sep_token": "<|category_sep|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "coord_token": "<|coord|>",
7
+ "det_token": "<|DET|>",
8
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
9
+ "diffusion_query_token": "<|diffusion_query|>",
10
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
11
+ "end_of_image_token": "<|end_of_image|>",
12
+ "end_of_query_token": "<|end_of_query|>",
13
+ "end_of_turn_token": "<|end_of_turn|>",
14
+ "end_of_video_token": "<|end_of_video|>",
15
+ "eos_token": "<|end_of_text|>",
16
+ "frame_sep_token": "<|frame_sep|>",
17
+ "image_cls_token": "<|image_cls|>",
18
+ "image_mask_token": "<|image_mask_token|>",
19
+ "image_reg_1_token": "<|image_reg_1|>",
20
+ "image_reg_2_token": "<|image_reg_2|>",
21
+ "image_reg_3_token": "<|image_reg_3|>",
22
+ "image_reg_4_token": "<|image_reg_4|>",
23
+ "image_reg_5_token": "<|image_reg_5|>",
24
+ "image_reg_6_token": "<|image_reg_6|>",
25
+ "image_reg_7_token": "<|image_reg_7|>",
26
+ "image_reg_8_token": "<|image_reg_8|>",
27
+ "image_row_sep_token": "<|image_row_sep|>",
28
+ "image_token": "<|image|>",
29
+ "is_local": true,
30
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
31
+ "model_input_names": [
32
+ "input_ids",
33
+ "attention_mask"
34
+ ],
35
+ "model_max_length": 1000000000000000019884624838656,
36
+ "model_specific_special_tokens": {
37
+ "caption_token": "<|CAPTION|>",
38
+ "category_sep_token": "<|category_sep|>",
39
+ "coord_token": "<|coord|>",
40
+ "det_token": "<|DET|>",
41
+ "detailed_caption_token": "<|DETAILED_CAPTION|>",
42
+ "diffusion_query_token": "<|diffusion_query|>",
43
+ "end_of_diffusion_query_token": "<|end_of_diffusion_query|>",
44
+ "end_of_image_token": "<|end_of_image|>",
45
+ "end_of_query_token": "<|end_of_query|>",
46
+ "end_of_turn_token": "<|end_of_turn|>",
47
+ "end_of_video_token": "<|end_of_video|>",
48
+ "frame_sep_token": "<|frame_sep|>",
49
+ "image_cls_token": "<|image_cls|>",
50
+ "image_mask_token": "<|image_mask_token|>",
51
+ "image_reg_1_token": "<|image_reg_1|>",
52
+ "image_reg_2_token": "<|image_reg_2|>",
53
+ "image_reg_3_token": "<|image_reg_3|>",
54
+ "image_reg_4_token": "<|image_reg_4|>",
55
+ "image_reg_5_token": "<|image_reg_5|>",
56
+ "image_reg_6_token": "<|image_reg_6|>",
57
+ "image_reg_7_token": "<|image_reg_7|>",
58
+ "image_reg_8_token": "<|image_reg_8|>",
59
+ "image_row_sep_token": "<|image_row_sep|>",
60
+ "image_token": "<|image|>",
61
+ "layout_detection_token": "<|LAYOUT_DETECTION|>",
62
+ "object_token": "<|object|>",
63
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
64
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
65
+ "ocr_plain_token": "<|OCR_PLAIN|>",
66
+ "perceive_token": "<|perceive|>",
67
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
68
+ "pointing_token": "<|POINTING|>",
69
+ "qa_token": "<|QA|>",
70
+ "ref_seg_token": "<|REF_SEG|>",
71
+ "seg_token": "<|seg|>",
72
+ "size_token": "<|size|>",
73
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
74
+ "start_of_image_token": "<|start_of_image|>",
75
+ "start_of_query_token": "<|start_of_query|>",
76
+ "start_of_turn_token": "<|start_of_turn|>",
77
+ "start_of_video_token": "<|start_of_video|>",
78
+ "table_data_end_token": "</td>",
79
+ "table_data_start_token": "<td>",
80
+ "table_header_end_token": "</th>",
81
+ "table_header_start_token": "<th>",
82
+ "table_row_end_token": "</tr>",
83
+ "table_row_start_token": "<tr>",
84
+ "task_sep_token": "<|task_sep|>"
85
+ },
86
+ "object_token": "<|object|>",
87
+ "ocr_doc_parser_token": "<|OCR_DOC_PARSER|>",
88
+ "ocr_grounding_token": "<|OCR_GROUNDING|>",
89
+ "ocr_plain_token": "<|OCR_PLAIN|>",
90
+ "perceive_token": "<|perceive|>",
91
+ "point_ref_seg_token": "<|POINT_REF_SEG|>",
92
+ "pointing_token": "<|POINTING|>",
93
+ "qa_token": "<|QA|>",
94
+ "ref_seg_token": "<|REF_SEG|>",
95
+ "seg_token": "<|seg|>",
96
+ "size_token": "<|size|>",
97
+ "start_of_diffusion_query_token": "<|start_of_diffusion_query|>",
98
+ "start_of_image_token": "<|start_of_image|>",
99
+ "start_of_query_token": "<|start_of_query|>",
100
+ "start_of_turn_token": "<|start_of_turn|>",
101
+ "start_of_video_token": "<|start_of_video|>",
102
+ "table_data_end_token": "</td>",
103
+ "table_data_start_token": "<td>",
104
+ "table_header_end_token": "</th>",
105
+ "table_header_start_token": "<th>",
106
+ "table_row_end_token": "</tr>",
107
+ "table_row_start_token": "<tr>",
108
+ "task_sep_token": "<|task_sep|>",
109
+ "tokenizer_class": "TokenizersBackend"
110
+ }