File size: 2,493 Bytes
"""Pre-download the model files the pipeline needs at runtime.

Specifically:

- doctr's db_resnet50 detector. The library's built-in download path hits
  a 308 redirect that urllib doesn't follow, leaving a 0-byte cache file.
  We work around it with curl, which follows redirects properly.
- TrOCR-base-handwritten (encoder + decoder). HuggingFace handles redirects
  correctly, so we just trigger the standard download.

Run once after `pip install -r requirements.txt`. Idempotent.
"""
from __future__ import annotations
import shutil
import subprocess
import sys
from pathlib import Path
# Direct link to the db_resnet50 detector weights (the URL pins release v0.7.0).
DOCTR_MODEL_URL = "https://doctr-static.mindee.com/models?id=v0.7.0/db_resnet50-79bd7d70.pt&src=0"

# Where the weights are stored: the per-user doctr model cache under $HOME.
DOCTR_CACHE = Path.home().joinpath(
    ".cache", "doctr", "models", "db_resnet50-79bd7d70.pt"
)

# Sanity floor for the weights file: it is ~97 MB on disk, so anything
# below 50 MB is treated as a failed or truncated download.
MIN_DOCTR_BYTES = 50 * 1024**2
def _remove_if_present(path: Path) -> None:
    """Delete *path*, ignoring the case where it does not exist."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def _ensure_doctr() -> None:
    """Make sure the doctr db_resnet50 weights are in the local cache.

    If ``DOCTR_CACHE`` already holds a file of at least ``MIN_DOCTR_BYTES``
    the function reports it and returns. Otherwise the weights are fetched
    with ``curl -fsSL`` from ``DOCTR_MODEL_URL``.

    The download is written to a sibling ``.part`` file and only renamed
    onto ``DOCTR_CACHE`` after it passes the size check, so an interrupted
    or truncated transfer can never be mistaken for a valid cached model on
    a later run.

    Raises:
        SystemExit: if curl is not on PATH, if curl exits with a non-zero
            status, or if the downloaded file is smaller than
            ``MIN_DOCTR_BYTES``.
    """
    DOCTR_CACHE.parent.mkdir(parents=True, exist_ok=True)

    if DOCTR_CACHE.exists():
        cached_size = DOCTR_CACHE.stat().st_size
        # Same threshold as the post-download check below: a file is
        # rejected only when it is strictly smaller than the floor.
        if cached_size >= MIN_DOCTR_BYTES:
            print(f"[doctr] cached: {DOCTR_CACHE} ({cached_size / 1e6:.0f} MB)")
            return

    if not shutil.which("curl"):
        sys.exit(
            "curl not found on PATH. Install curl, or download the doctr model "
            f"manually from {DOCTR_MODEL_URL} to {DOCTR_CACHE}"
        )

    print(f"[doctr] downloading via curl -> {DOCTR_CACHE}")
    # Stage the download next to the final path (same directory, so the
    # final rename stays on one filesystem).
    partial = DOCTR_CACHE.with_name(DOCTR_CACHE.name + ".part")
    try:
        result = subprocess.run(
            ["curl", "-fsSL", DOCTR_MODEL_URL, "-o", str(partial)],
            check=False,
        )
        if result.returncode != 0:
            sys.exit(f"curl failed with exit code {result.returncode}")

        size = partial.stat().st_size
        if size < MIN_DOCTR_BYTES:
            sys.exit(
                f"Downloaded file is suspiciously small ({size} bytes). "
                "Inspect the URL/network and retry."
            )

        # Publish the validated file in one step.
        partial.replace(DOCTR_CACHE)
    finally:
        # No-op after a successful rename; on any failure (including
        # Ctrl-C) this removes the incomplete download.
        _remove_if_present(partial)

    print(f"[doctr] ok ({size / 1e6:.0f} MB)")
def _ensure_trocr() -> None:
    """Populate the HuggingFace cache with the TrOCR handwritten checkpoint.

    Calls ``from_pretrained`` on both the processor and the
    encoder-decoder model, which downloads the files on first run.
    """
    checkpoint = "microsoft/trocr-base-handwritten"
    print("[trocr] loading microsoft/trocr-base-handwritten (will download on first run)")

    # Imported lazily, after the progress message, so the heavy transformers
    # import only happens once this step actually runs.
    from transformers import TrOCRProcessor, VisionEncoderDecoderModel

    # Processor first, then the model -- the returned objects are discarded;
    # only the download side effect matters here.
    for loader in (TrOCRProcessor, VisionEncoderDecoderModel):
        loader.from_pretrained(checkpoint)

    print("[trocr] ok")
def main() -> int:
    """Run every model pre-download step in order.

    Returns:
        0 once all steps have completed; a failing step exits the process
        itself before this point is reached.
    """
    steps = (_ensure_doctr, _ensure_trocr)
    for ensure in steps:
        ensure()

    print("\nAll models ready.")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|