overeducated committed on
Commit
6ad2801
·
1 Parent(s): 84d580c

docling: containerized docling-serve without UI; preseed EasyOCR models; expose v1 API on 7860

Browse files
Dockerfile CHANGED
@@ -1,42 +1,21 @@
1
- # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- FROM python:3.12-slim
3
 
4
- ENV PYTHONUNBUFFERED=1 \
5
- PIP_NO_CACHE_DIR=off \
6
- UV_COMPILE_BYTECODE=1 \
7
- UV_LINK_MODE=copy \
8
- UV_PROJECT_ENVIRONMENT=/opt/app-root \
9
- OMP_NUM_THREADS=4 \
10
- CMAKE_BUILD_PARALLEL_LEVEL=4
11
 
12
- # System deps for easyocr (opencv, tesseract data optional)
13
- RUN apt-get update -y && apt-get install -y --no-install-recommends \
14
- build-essential \
15
- libgl1 \
16
- libglib2.0-0 \
17
- curl ca-certificates && \
18
- rm -rf /var/lib/apt/lists/*
19
 
20
- # Install uv
21
- RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
22
- /root/.cargo/bin/uv --version
23
-
24
- WORKDIR /app
25
-
26
- # Copy project metadata and source
27
- COPY pyproject.toml /app/pyproject.toml
28
- COPY src /app/src
29
-
30
- # Sync dependencies into project env
31
- RUN /root/.cargo/bin/uv sync --frozen
32
-
33
- # Create cache volume for EasyOCR
34
- VOLUME ["/cache"]
35
-
36
- # Expose and healthcheck
37
  EXPOSE 7860
38
- HEALTHCHECK --interval=20s --timeout=5s --start-period=10s --retries=5 \
39
- CMD sh -c 'curl -fsS http://127.0.0.1:${PORT:-7860}/ping > /dev/null || exit 1'
40
 
41
- ENV PORT=7860
42
- CMD ["/root/.cargo/bin/uv", "run", "uvicorn", "ocr_service.app:app", "--host", "0.0.0.0", "--port", "${PORT}"]
 
 
 
 
 
1
+ FROM ghcr.io/docling-project/docling-serve-cu128:main
 
2
 
3
+ ENV DOCLING_SERVE_ENABLE_UI=false \
4
+ UVICORN_PORT=7860 \
5
+ OMP_NUM_THREADS=4
 
 
 
 
6
 
7
+ # Pre-seed EasyOCR models into Docling cache to avoid download issues
8
+ RUN mkdir -p /opt/app-root/src/.cache/docling/models/EasyOcr && \
9
+ curl -L --fail --retry 5 --retry-connrefused -o /opt/app-root/src/.cache/docling/models/EasyOcr/craft_mlt_25k.pth \
10
+ https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/craft_mlt_25k.pth && \
11
+ curl -L --fail --retry 5 --retry-connrefused -o /opt/app-root/src/.cache/docling/models/EasyOcr/english_g2.pth \
12
+ https://github.com/JaidedAI/EasyOCR/releases/download/v1.6.2/english_g2.pth
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  EXPOSE 7860
 
 
15
 
16
+ # Healthcheck on API docs
17
+ HEALTHCHECK --interval=20s --timeout=5s --start-period=15s --retries=5 \
18
+ CMD sh -lc 'curl -fsS http://127.0.0.1:${UVICORN_PORT}/docs >/dev/null || exit 1'
19
+
20
+ ENTRYPOINT []
21
+ CMD ["docling-serve", "run", "--host", "0.0.0.0", "--port", "7860"]
pyproject.toml DELETED
@@ -1,19 +0,0 @@
1
- [project]
2
- name = "ocr-service"
3
- version = "0.1.0"
4
- description = "FastAPI OCR service with EasyOCR and OnnxTR (stub)"
5
- requires-python = ">=3.10"
6
- dependencies = [
7
- "fastapi>=0.115.0",
8
- "uvicorn[standard]>=0.30.0",
9
- "easyocr>=1.7.2",
10
- "opencv-python-headless>=4.10.0.84",
11
- "pymupdf>=1.24.10",
12
- "pillow>=10.3.0",
13
- "python-multipart>=0.0.9",
14
- "pydantic>=2.7.0",
15
- "requests>=2.32.0"
16
- ]
17
-
18
- [tool.uv]
19
- index-url = "https://pypi.org/simple"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,2 +0,0 @@
1
- fastapi
2
- uvicorn[standard]
 
 
 
src/ocr_service/__init__.py DELETED
@@ -1 +0,0 @@
1
- __all__ = []
 
 
src/ocr_service/app.py DELETED
@@ -1,76 +0,0 @@
1
- import io
2
- import os
3
- from enum import Enum
4
- from typing import List
5
-
6
- import requests
7
- import fitz # PyMuPDF
8
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
9
- from fastapi.responses import JSONResponse
10
-
11
- app = FastAPI(title="OCR Service", version="0.1.0")
12
-
13
- class OcrEngine(str, Enum):
14
- EASYOCR = "easyocr"
15
- ONNXTR = "onnxtr"
16
-
17
-
18
- def get_easyocr_reader(lang: str = "en"):
19
- import easyocr
20
- cache_dir = os.environ.get("EASYOCR_CACHE", "/cache/EasyOcr")
21
- os.makedirs(cache_dir, exist_ok=True)
22
- return easyocr.Reader([lang], gpu=True, model_storage_directory=cache_dir)
23
-
24
-
25
- def pdf_bytes_to_images(pdf_bytes: bytes, dpi: int = 200) -> List[bytes]:
26
- doc = fitz.open(stream=pdf_bytes, filetype="pdf")
27
- images: List[bytes] = []
28
- for page in doc:
29
- mat = fitz.Matrix(dpi / 72.0, dpi / 72.0)
30
- pix = page.get_pixmap(matrix=mat)
31
- images.append(pix.tobytes("png"))
32
- doc.close()
33
- return images
34
-
35
-
36
- @app.get("/ping")
37
- async def ping() -> dict:
38
- return {"status": "ok"}
39
-
40
-
41
- @app.post("/ocr/url")
42
- async def ocr_from_url(url: str = Form(...), engine: OcrEngine = Form(OcrEngine.EASYOCR), lang: str = Form("en")):
43
- if engine == OcrEngine.ONNXTR:
44
- raise HTTPException(status_code=501, detail="OnnxTR engine not yet wired. Use engine=easyocr.")
45
- try:
46
- resp = requests.get(url, timeout=30)
47
- resp.raise_for_status()
48
- content = resp.content
49
- reader = get_easyocr_reader(lang)
50
- lines: List[str] = []
51
- if resp.headers.get("content-type", "").lower().startswith("application/pdf") or url.lower().endswith(".pdf"):
52
- for img in pdf_bytes_to_images(content):
53
- lines.extend(reader.readtext(img, detail=0))
54
- else:
55
- lines = reader.readtext(content, detail=0)
56
- return {"engine": engine, "lang": lang, "lines": lines}
57
- except Exception as e:
58
- raise HTTPException(status_code=400, detail=str(e))
59
-
60
-
61
- @app.post("/ocr/file")
62
- async def ocr_from_file(file: UploadFile = File(...), engine: OcrEngine = Form(OcrEngine.EASYOCR), lang: str = Form("en")):
63
- if engine == OcrEngine.ONNXTR:
64
- raise HTTPException(status_code=501, detail="OnnxTR engine not yet wired. Use engine=easyocr.")
65
- try:
66
- data = await file.read()
67
- reader = get_easyocr_reader(lang)
68
- lines: List[str] = []
69
- if file.filename.lower().endswith(".pdf"):
70
- for img in pdf_bytes_to_images(data):
71
- lines.extend(reader.readtext(img, detail=0))
72
- else:
73
- lines = reader.readtext(data, detail=0)
74
- return {"engine": engine, "lang": lang, "lines": lines}
75
- except Exception as e:
76
- raise HTTPException(status_code=400, detail=str(e))