Spaces:

internationalscholarsprogram
/

handbook-ocr-engine

Sleeping

App Files Files Community

internationalscholarsprogram committed on Apr 20

Commit

f04bfbe

verified ·

1 Parent(s): dc9cb16

Fix: convert image files (WebP, etc.) to PDF before extraction

Browse files

Files changed (1) hide show

app/services/extraction_pipeline.py +46 -0

app/services/extraction_pipeline.py CHANGED Viewed

@@ -31,11 +31,48 @@ from app.services.ocr_extractor import ocr_page, tesseract_available
 logger = logging.getLogger(__name__)
 def extract_plain(pdf_path: str | Path) -> PlainExtractionResult:
     """Quick plain-text extraction (no structure, no tables)."""
     t0 = time.monotonic()
     p = Path(pdf_path)
     settings = get_settings()
     doc_id = uuid.uuid4().hex[:16]
     metadata = extract_metadata(p)
@@ -69,6 +106,15 @@ def extract_structured(pdf_path: str | Path) -> ExtractionResult:
     """Full structured extraction: text + OCR with table detection."""
     t0 = time.monotonic()
     p = Path(pdf_path)
     settings = get_settings()
     doc_id = uuid.uuid4().hex[:16]
     metadata = extract_metadata(p)

 logger = logging.getLogger(__name__)
_IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}


def _ensure_pdf(path: Path) -> tuple[Path, bool]:
    """Return a PDF path for *path*, converting image inputs to a one-page PDF.

    The decision is made purely on the file suffix (case-insensitive): any
    suffix listed in ``_IMAGE_EXTENSIONS`` is treated as an image; everything
    else is returned untouched.

    Args:
        path: Path to the input file.

    Returns:
        ``(pdf_path, converted)``. ``converted`` is ``False`` when *path* is
        returned unchanged, and ``True`` when a new temporary PDF was written
        next to *path*; in that case the caller owns the file and is
        responsible for deleting it.

    Raises:
        Whatever Pillow or PyMuPDF raise while reading the image or writing
        the PDF. The temporary PDF is removed before the error propagates.
    """
    if path.suffix.lower() not in _IMAGE_EXTENSIONS:
        return path, False

    # Kept function-local: these are only needed on the image path.
    import io
    import tempfile

    import fitz
    from PIL import Image

    # The context manager closes the underlying image file; convert() returns
    # an independent in-memory copy that stays valid afterwards.
    with Image.open(path) as source:
        rgb = source.convert("RGB")
    width, height = rgb.size

    # Re-encode as PNG so PyMuPDF receives a format it can embed directly.
    buf = io.BytesIO()
    rgb.save(buf, format="PNG")

    # Reserve a unique file name, then close the handle *before* PyMuPDF
    # writes to it: a still-open handle blocks the write on Windows.
    with tempfile.NamedTemporaryFile(
        suffix=".pdf", dir=path.parent, delete=False
    ) as tmp:
        pdf_path = Path(tmp.name)

    try:
        doc = fitz.open()
        try:
            # Page size equals the pixel size so the image fills the page.
            page = doc.new_page(width=width, height=height)
            page.insert_image(
                fitz.Rect(0, 0, width, height), stream=buf.getvalue()
            )
            doc.save(str(pdf_path))
        finally:
            doc.close()
    except BaseException:
        # Do not leave a stray temp file behind when conversion fails.
        pdf_path.unlink(missing_ok=True)
        raise

    return pdf_path, True
 def extract_plain(pdf_path: str | Path) -> PlainExtractionResult:
     """Run plain-text extraction on a PDF or image file.

     The input is first passed through ``_ensure_pdf``; when that produced a
     temporary PDF, the file is deleted once extraction has finished,
     whether it succeeded or raised.
     """
     # Start the clock before any conversion so it is included in the timing.
     t0 = time.monotonic()
     pdf_file, is_temporary = _ensure_pdf(Path(pdf_path))
     try:
         result = _extract_plain_inner(pdf_file, t0)
     finally:
         # Only remove files this function caused to be created.
         if is_temporary:
             pdf_file.unlink(missing_ok=True)
     return result
+def _extract_plain_inner(p: Path, t0: float) -> PlainExtractionResult:
     settings = get_settings()
     doc_id = uuid.uuid4().hex[:16]
     metadata = extract_metadata(p)
     """Full structured extraction: text + OCR with table detection."""
     t0 = time.monotonic()
     p = Path(pdf_path)
+    p, converted = _ensure_pdf(p)
+    try:
+        return _extract_structured_inner(p, t0)
+    finally:
+        if converted:
+            p.unlink(missing_ok=True)
+def _extract_structured_inner(p: Path, t0: float) -> ExtractionResult:
     settings = get_settings()
     doc_id = uuid.uuid4().hex[:16]
     metadata = extract_metadata(p)