overeducated committed on
Commit
84d580c
·
0 Parent(s):

init: ocr-service (async FastAPI with EasyOCR; PDF support)

Browse files
Files changed (9) hide show
  1. .gitattributes +35 -0
  2. .gitignore +1 -0
  3. Dockerfile +42 -0
  4. README.md +12 -0
  5. git +0 -0
  6. pyproject.toml +19 -0
  7. requirements.txt +2 -0
  8. src/ocr_service/__init__.py +1 -0
  9. src/ocr_service/app.py +76 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
FROM python:3.12-slim

# PYTHONPATH: pyproject.toml declares no [build-system], so `uv sync` installs
# only the dependencies, not the project itself; the src/ layout therefore has
# to be put on the import path explicitly for `ocr_service.app` to resolve.
# PATH: expose the synced environment's console scripts (uvicorn) directly.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=off \
    UV_COMPILE_BYTECODE=1 \
    UV_LINK_MODE=copy \
    UV_PROJECT_ENVIRONMENT=/opt/app-root \
    OMP_NUM_THREADS=4 \
    CMAKE_BUILD_PARALLEL_LEVEL=4 \
    PYTHONPATH=/app/src \
    PATH=/opt/app-root/bin:$PATH

# System deps for easyocr (opencv, tesseract data optional).
# curl is kept because the HEALTHCHECK below uses it.
RUN apt-get update -y && apt-get install -y --no-install-recommends \
        build-essential \
        libgl1 \
        libglib2.0-0 \
        curl ca-certificates && \
    rm -rf /var/lib/apt/lists/*

# Install uv from the official image so the binary lives at a fixed location.
# (The install script no longer places it in /root/.cargo/bin.)
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN uv --version

WORKDIR /app

# Copy project metadata and source
COPY pyproject.toml /app/pyproject.toml
COPY src /app/src

# Sync dependencies into the project env (UV_PROJECT_ENVIRONMENT).
# --frozen is not used: it requires a uv.lock, and none is copied into the image.
RUN uv sync

# Create cache volume for EasyOCR
VOLUME ["/cache"]

# Expose and healthcheck
EXPOSE 7860
HEALTHCHECK --interval=20s --timeout=5s --start-period=10s --retries=5 \
    CMD sh -c 'curl -fsS http://127.0.0.1:${PORT:-7860}/ping > /dev/null || exit 1'

ENV PORT=7860
# Exec-form CMD performs no variable substitution, so "${PORT}" would reach
# uvicorn as a literal string. Run through a shell to expand it; `exec` keeps
# uvicorn as PID 1 so it receives stop signals.
CMD ["sh", "-c", "exec uvicorn ocr_service.app:app --host 0.0.0.0 --port ${PORT:-7860}"]
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Docling Serializer
3
+ emoji: 🏢
4
+ colorFrom: purple
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: apache-2.0
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
git ADDED
File without changes
pyproject.toml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "ocr-service"
3
+ version = "0.1.0"
4
+ description = "FastAPI OCR service with EasyOCR and OnnxTR (stub)"
5
+ requires-python = ">=3.10"
6
+ dependencies = [
7
+ "fastapi>=0.115.0",
8
+ "uvicorn[standard]>=0.30.0",
9
+ "easyocr>=1.7.2",
10
+ "opencv-python-headless>=4.10.0.84",
11
+ "pymupdf>=1.24.10",
12
+ "pillow>=10.3.0",
13
+ "python-multipart>=0.0.9",
14
+ "pydantic>=2.7.0",
15
+ "requests>=2.32.0"
16
+ ]
17
+
18
+ [tool.uv]
19
+ index-url = "https://pypi.org/simple"
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
src/ocr_service/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ __all__ = []
src/ocr_service/app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ from enum import Enum
4
+ from typing import List
5
+
6
+ import requests
7
+ import fitz # PyMuPDF
8
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
9
+ from fastapi.responses import JSONResponse
10
+
11
+ app = FastAPI(title="OCR Service", version="0.1.0")
12
+
13
class OcrEngine(str, Enum):
    """OCR backends a client may request.

    Members are also plain strings, so they compare equal to (and serialize
    as) their lowercase values.
    """

    # Backend used by the OCR endpoints.
    EASYOCR = "easyocr"
    # Selectable, but the OCR endpoints answer it with HTTP 501.
    ONNXTR = "onnxtr"
16
+
17
+
18
# Readers already built, keyed by (language, model directory).
_EASYOCR_READERS = {}


def get_easyocr_reader(lang: str = "en"):
    """Return an EasyOCR reader for *lang*, building it at most once.

    Constructing ``easyocr.Reader`` is expensive, so instances are cached and
    shared between requests instead of being rebuilt on every call.

    Args:
        lang: Language code handed to EasyOCR as a single-element list.

    Returns:
        The cached (or newly created) ``easyocr.Reader``.

    The model directory comes from the ``EASYOCR_CACHE`` environment variable
    (default ``/cache/EasyOcr``) and is created if it does not exist.
    """
    cache_dir = os.environ.get("EASYOCR_CACHE", "/cache/EasyOcr")
    key = (lang, cache_dir)
    reader = _EASYOCR_READERS.get(key)
    if reader is None:
        # Imported lazily so importing this module stays cheap.
        import easyocr

        os.makedirs(cache_dir, exist_ok=True)
        # setdefault: if two callers race, both end up with the same instance;
        # the only cost is one redundant construction.
        # NOTE(review): the cache grows with every distinct language requested;
        # cap it if clients can ask for many languages.
        reader = _EASYOCR_READERS.setdefault(
            key,
            easyocr.Reader([lang], gpu=True, model_storage_directory=cache_dir),
        )
    return reader
23
+
24
+
25
def pdf_bytes_to_images(pdf_bytes: bytes, dpi: int = 200) -> List[bytes]:
    """Render every page of an in-memory PDF to PNG-encoded bytes.

    Args:
        pdf_bytes: Raw content of the PDF document.
        dpi: Target resolution; pages are scaled by ``dpi / 72`` on both axes.

    Returns:
        One PNG byte string per page, in page order.

    Raises:
        Whatever PyMuPDF raises when the data cannot be opened or rendered.
    """
    # The scale is the same for every page, so build the matrix once.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return [page.get_pixmap(matrix=matrix).tobytes("png") for page in doc]
    finally:
        # Release the document even when a page fails to render.
        doc.close()
34
+
35
+
36
@app.get("/ping")
async def ping() -> dict:
    """Liveness endpoint: always answers with a fixed OK payload."""
    payload = {"status": "ok"}
    return payload
39
+
40
+
41
def _ocr_remote_document(url: str, lang: str) -> List[str]:
    """Download *url* and OCR it, returning the recognised text lines.

    Blocking helper (network I/O plus OCR); meant to run off the event loop.
    """
    from urllib.parse import urlparse

    parsed = urlparse(url)
    # NOTE(review): the URL is client-supplied. This only restricts the scheme;
    # it does not stop requests to internal hosts (SSRF). Add host filtering if
    # the service is reachable by untrusted clients.
    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"Unsupported URL scheme: {parsed.scheme!r}")

    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    content = resp.content

    content_type = resp.headers.get("content-type", "").lower()
    # Check the URL *path* (a query string must not hide a .pdf suffix) and
    # fall back to the PDF magic bytes when neither header nor name helps.
    is_pdf = (
        content_type.startswith("application/pdf")
        or parsed.path.lower().endswith(".pdf")
        or content.startswith(b"%PDF-")
    )

    reader = get_easyocr_reader(lang)
    if not is_pdf:
        return reader.readtext(content, detail=0)

    lines: List[str] = []
    for img in pdf_bytes_to_images(content):
        lines.extend(reader.readtext(img, detail=0))
    return lines


@app.post("/ocr/url")
async def ocr_from_url(url: str = Form(...), engine: OcrEngine = Form(OcrEngine.EASYOCR), lang: str = Form("en")):
    """Fetch a remote image or PDF and return its OCR text.

    Form fields:
        url: http(s) location of the document.
        engine: OCR backend; only ``easyocr`` is implemented.
        lang: Language code passed to the reader.

    Returns:
        ``{"engine": ..., "lang": ..., "lines": [...]}``.

    Raises:
        HTTPException: 501 for the OnnxTR engine; 400 when downloading or
            recognising the document fails.
    """
    import asyncio

    if engine == OcrEngine.ONNXTR:
        raise HTTPException(status_code=501, detail="OnnxTR engine not yet wired. Use engine=easyocr.")
    try:
        # Download and OCR are blocking; run them in a worker thread so the
        # event loop (and with it /ping) stays responsive.
        lines = await asyncio.to_thread(_ocr_remote_document, url, lang)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    return {"engine": engine, "lang": lang, "lines": lines}
59
+
60
+
61
def _ocr_uploaded_document(data: bytes, is_pdf: bool, lang: str) -> List[str]:
    """OCR uploaded bytes, rasterising page by page when *is_pdf* is true.

    Blocking helper; meant to run off the event loop.
    """
    reader = get_easyocr_reader(lang)
    if not is_pdf:
        return reader.readtext(data, detail=0)

    lines: List[str] = []
    for img in pdf_bytes_to_images(data):
        lines.extend(reader.readtext(img, detail=0))
    return lines


@app.post("/ocr/file")
async def ocr_from_file(file: UploadFile = File(...), engine: OcrEngine = Form(OcrEngine.EASYOCR), lang: str = Form("en")):
    """Run OCR on an uploaded image or PDF.

    Form fields:
        file: The uploaded document.
        engine: OCR backend; only ``easyocr`` is implemented.
        lang: Language code passed to the reader.

    Returns:
        ``{"engine": ..., "lang": ..., "lines": [...]}``.

    Raises:
        HTTPException: 501 for the OnnxTR engine; 400 when the upload is empty
            or cannot be processed.
    """
    import asyncio

    if engine == OcrEngine.ONNXTR:
        raise HTTPException(status_code=501, detail="OnnxTR engine not yet wired. Use engine=easyocr.")
    try:
        data = await file.read()
        if not data:
            raise ValueError("Uploaded file is empty.")

        # filename and content_type may be missing on an upload, so default
        # both to "" and also sniff the PDF magic bytes.
        filename = (file.filename or "").lower()
        content_type = (file.content_type or "").lower()
        is_pdf = (
            filename.endswith(".pdf")
            or content_type.startswith("application/pdf")
            or data.startswith(b"%PDF-")
        )

        # OCR is blocking; run it in a worker thread so the event loop
        # (and with it /ping) stays responsive.
        lines = await asyncio.to_thread(_ocr_uploaded_document, data, is_pdf, lang)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    return {"engine": engine, "lang": lang, "lines": lines}