| """Pre-download the model files the pipeline needs at runtime. |
| |
| Specifically: |
| - doctr's db_resnet50 detector. The library's built-in download path hits |
| a 308 redirect that urllib doesn't follow, leaving a 0-byte cache file. |
| We work around it with curl which follows redirects properly. |
| - TrOCR-base-handwritten (encoder + decoder). HuggingFace handles redirects |
| correctly so we just trigger the standard download. |
| |
| Run once after `pip install -r requirements.txt`. Idempotent. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import shutil |
| import subprocess |
| import sys |
| from pathlib import Path |
|
|
# Download link for the doctr db_resnet50 detector weights.
DOCTR_MODEL_URL = (
    "https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet50-79bd7d70.pt&src=0"
)
# File the weights are written to, under the current user's home directory.
# NOTE(review): doctr may honour a cache-directory override via environment
# variable; this path does not account for one -- confirm against the
# installed doctr version.
DOCTR_CACHE = Path.home().joinpath(
    ".cache", "doctr", "models", "db_resnet50-79bd7d70.pt"
)
# 50 MiB: size threshold the cached/downloaded file is compared against.
MIN_DOCTR_BYTES = 50 * 1024**2
|
|
|
|
def _ensure_doctr() -> None:
    """Make sure the doctr db_resnet50 weights are present at ``DOCTR_CACHE``.

    Returns immediately when ``DOCTR_CACHE`` already holds a file of at least
    ``MIN_DOCTR_BYTES``. Otherwise the weights are fetched with
    ``curl -fsSL`` into a sibling ``.part`` file, which is moved onto
    ``DOCTR_CACHE`` only after curl succeeded and the size check passed. A
    failed or interrupted transfer therefore never leaves a truncated file at
    the cache path where a later run could mistake it for a valid download.

    Raises:
        SystemExit: if curl is not on PATH, curl exits with a non-zero
            status, or the downloaded file is smaller than
            ``MIN_DOCTR_BYTES``.
    """
    DOCTR_CACHE.parent.mkdir(parents=True, exist_ok=True)

    # One stat() call instead of exists() + stat() twice.
    try:
        cached_size = DOCTR_CACHE.stat().st_size
    except FileNotFoundError:
        cached_size = -1
    # ">=" mirrors the post-download check below, which only rejects files
    # strictly smaller than the threshold.
    if cached_size >= MIN_DOCTR_BYTES:
        print(f"[doctr] cached: {DOCTR_CACHE} ({cached_size / 1e6:.0f} MB)")
        return

    if not shutil.which("curl"):
        sys.exit(
            "curl not found on PATH. Install curl, or download the doctr model "
            f"manually from {DOCTR_MODEL_URL} to {DOCTR_CACHE}"
        )

    # Download next to the final location so the closing rename stays on the
    # same filesystem.
    partial = DOCTR_CACHE.with_name(DOCTR_CACHE.name + ".part")
    print(f"[doctr] downloading via curl -> {DOCTR_CACHE}")
    try:
        result = subprocess.run(
            ["curl", "-fsSL", DOCTR_MODEL_URL, "-o", str(partial)],
            check=False,
        )
        if result.returncode != 0:
            sys.exit(f"curl failed with exit code {result.returncode}")
        try:
            size = partial.stat().st_size
        except FileNotFoundError:
            # curl reported success but produced no file; treat as empty.
            size = 0
        if size < MIN_DOCTR_BYTES:
            sys.exit(
                f"Downloaded file is suspiciously small ({size} bytes). "
                "Inspect the URL/network and retry."
            )
        # Publish the validated file in a single step.
        partial.replace(DOCTR_CACHE)
    finally:
        # Runs on every exit path (including SystemExit and Ctrl-C). After a
        # successful replace() the partial file is already gone.
        try:
            partial.unlink()
        except FileNotFoundError:
            pass
    print(f"[doctr] ok ({size / 1e6:.0f} MB)")
|
|
|
|
def _ensure_trocr() -> None:
    """Load TrOCR-base-handwritten once so its files get fetched.

    ``transformers`` is imported inside the function, then ``from_pretrained``
    is called on the processor and on the encoder-decoder model for the same
    checkpoint. The loaded objects are discarded.
    """
    print("[trocr] loading microsoft/trocr-base-handwritten (will download on first run)")
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    checkpoint = "microsoft/trocr-base-handwritten"
    # Processor first, then the model -- same order as two explicit calls.
    for loader in (TrOCRProcessor, VisionEncoderDecoderModel):
        loader.from_pretrained(checkpoint)
    print("[trocr] ok")
|
|
|
|
def main() -> int:
    """Run each model-preparation step in order and return exit status 0."""
    # doctr first, then TrOCR; either step may terminate the process itself.
    for ensure_step in (_ensure_doctr, _ensure_trocr):
        ensure_step()
    print("\nAll models ready.")
    return 0
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())
|
|