internationalscholarsprogram committed on
Commit
f04bfbe
·
verified ·
1 Parent(s): dc9cb16

Fix: convert image files (WebP, etc.) to PDF before extraction

Browse files
Files changed (1) hide show
  1. app/services/extraction_pipeline.py +46 -0
app/services/extraction_pipeline.py CHANGED
@@ -31,11 +31,48 @@ from app.services.ocr_extractor import ocr_page, tesseract_available
31
 
32
  logger = logging.getLogger(__name__)
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def extract_plain(pdf_path: str | Path) -> PlainExtractionResult:
36
  """Quick plain-text extraction (no structure, no tables)."""
37
  t0 = time.monotonic()
38
  p = Path(pdf_path)
 
 
 
 
 
 
 
 
 
39
  settings = get_settings()
40
  doc_id = uuid.uuid4().hex[:16]
41
  metadata = extract_metadata(p)
@@ -69,6 +106,15 @@ def extract_structured(pdf_path: str | Path) -> ExtractionResult:
69
  """Full structured extraction: text + OCR with table detection."""
70
  t0 = time.monotonic()
71
  p = Path(pdf_path)
 
 
 
 
 
 
 
 
 
72
  settings = get_settings()
73
  doc_id = uuid.uuid4().hex[:16]
74
  metadata = extract_metadata(p)
 
31
 
32
  logger = logging.getLogger(__name__)
33
 
34
+ _IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
35
+
36
+
37
+ def _ensure_pdf(path: Path) -> tuple[Path, bool]:
38
+ """If *path* is an image, convert it to a single-page PDF and return (pdf_path, True).
39
+ For PDFs, return (path, False) unchanged."""
40
+ if path.suffix.lower() not in _IMAGE_EXTENSIONS:
41
+ return path, False
42
+
43
+ import fitz
44
+ from PIL import Image
45
+ import io, tempfile, os
46
+
47
+ img = Image.open(path).convert("RGB")
48
+ buf = io.BytesIO()
49
+ img.save(buf, format="PNG")
50
+ buf.seek(0)
51
+
52
+ doc = fitz.open()
53
+ page = doc.new_page(width=img.width, height=img.height)
54
+ page.insert_image(fitz.Rect(0, 0, img.width, img.height), stream=buf.read())
55
+
56
+ tmp = tempfile.NamedTemporaryFile(suffix=".pdf", dir=path.parent, delete=False)
57
+ doc.save(tmp.name)
58
+ doc.close()
59
+ tmp.close()
60
+ return Path(tmp.name), True
61
+
62
 
63
  def extract_plain(pdf_path: str | Path) -> PlainExtractionResult:
64
  """Quick plain-text extraction (no structure, no tables)."""
65
  t0 = time.monotonic()
66
  p = Path(pdf_path)
67
+ p, converted = _ensure_pdf(p)
68
+ try:
69
+ return _extract_plain_inner(p, t0)
70
+ finally:
71
+ if converted:
72
+ p.unlink(missing_ok=True)
73
+
74
+
75
+ def _extract_plain_inner(p: Path, t0: float) -> PlainExtractionResult:
76
  settings = get_settings()
77
  doc_id = uuid.uuid4().hex[:16]
78
  metadata = extract_metadata(p)
 
106
  """Full structured extraction: text + OCR with table detection."""
107
  t0 = time.monotonic()
108
  p = Path(pdf_path)
109
+ p, converted = _ensure_pdf(p)
110
+ try:
111
+ return _extract_structured_inner(p, t0)
112
+ finally:
113
+ if converted:
114
+ p.unlink(missing_ok=True)
115
+
116
+
117
+ def _extract_structured_inner(p: Path, t0: float) -> ExtractionResult:
118
  settings = get_settings()
119
  doc_id = uuid.uuid4().hex[:16]
120
  metadata = extract_metadata(p)