Spaces:

sammoftah
/

rag-from-scratch

Sleeping

App Files Files Community

sammoftah committed on Apr 25

Commit

080adfc

verified ·

1 Parent(s): 9bf4536

Add OCR fallback for scanned PDFs

Browse files

Files changed (2) hide show

app.py +80 -10
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -8,15 +8,26 @@ import math
 import re
 from collections import Counter
 try:
     import fitz  # PyMuPDF
 except Exception:  # pragma: no cover - optional runtime fallback
     fitz = None
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from shared.components import create_method_panel, create_premium_hero
 client = InferenceClient(token=os.getenv("HF_TOKEN"))
 # Global storage
 chunks = []
@@ -71,23 +82,78 @@ def extract_with_pypdf(payload):
 def extract_with_pymupdf(payload):
     """Second-pass extraction for PDFs PyPDF2 parses poorly."""
     if fitz is None:
-        return ""
     text = ""
     with fitz.open(stream=payload, filetype="pdf") as document:
         for page in document:
             text += page.get_text("text") + "\n"
-    return text
 def extract_text_from_pdf(pdf_file):
-    """Extract embedded text from a PDF upload."""
     payload, source_name = read_uploaded_pdf(pdf_file)
     text = extract_with_pypdf(payload).strip()
     if len(text.split()) < 5:
-        text = extract_with_pymupdf(payload).strip()
-    return text, source_name
 def chunk_text(text, chunk_size=500, overlap=50):
     """Split text into overlapping chunks."""
@@ -114,7 +180,7 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
     progress(0, desc="Extracting text from PDFs...")
     for i, pdf_file in enumerate(pdf_files):
         try:
-            text, source_name = extract_text_from_pdf(pdf_file)
         except Exception as exc:
             return f"❌ Could not read PDF: {exc}"
         pdf_chunks = chunk_text(text)
@@ -122,10 +188,14 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
         sources.extend([source_name] * len(pdf_chunks))
         word_count = len(text.split())
         if word_count:
-            extraction_notes.append(f"- {source_name}: {word_count:,} words extracted")
         else:
             extraction_notes.append(
-                f"- {source_name}: no embedded text found. This usually means the PDF is scanned/image-only and needs OCR."
             )
         progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
@@ -133,8 +203,8 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
         return (
             "❌ No text extracted from PDFs\n\n"
             + "\n".join(extraction_notes)
-            + "\n\nTry a text-based PDF, or run OCR first with a tool such as Adobe OCR, macOS Preview/Live Text export, "
-            "Google Drive OCR, or `ocrmypdf`, then upload the searchable PDF."
         )
     progress(0.7, desc="Building lexical retrieval index...")

 import re
 from collections import Counter
+try:
+    import numpy as np
+except Exception:  # pragma: no cover - optional runtime fallback
+    np = None
 try:
     import fitz  # PyMuPDF
 except Exception:  # pragma: no cover - optional runtime fallback
     fitz = None
+try:
+    from rapidocr_onnxruntime import RapidOCR
+except Exception:  # pragma: no cover - optional runtime fallback
+    RapidOCR = None
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 from shared.components import create_method_panel, create_premium_hero
 client = InferenceClient(token=os.getenv("HF_TOKEN"))
+ocr_engine = None
 # Global storage
 chunks = []
 def extract_with_pymupdf(payload):
     """Second-pass extraction for PDFs PyPDF2 parses poorly."""
     if fitz is None:
+        return "", 0
     text = ""
     with fitz.open(stream=payload, filetype="pdf") as document:
         for page in document:
             text += page.get_text("text") + "\n"
+        page_count = document.page_count
+    return text, page_count
def get_ocr_engine():
    """Return the shared OCR engine, building it on first use.

    Construction is deferred until OCR is actually requested so that PDFs
    with a usable text layer never pay for it.

    Returns:
        The module-wide ``RapidOCR`` instance, or ``None`` when the
        ``RapidOCR`` import failed at module load.
    """
    global ocr_engine
    ocr_available = RapidOCR is not None
    if ocr_available and ocr_engine is None:
        # First request: create the engine once and cache it module-wide.
        ocr_engine = RapidOCR()
    return ocr_engine if ocr_available else None
def extract_with_ocr(payload, max_pages=12):
    """Render PDF pages and OCR them when no embedded text exists.

    Args:
        payload: PDF content handed to ``fitz.open(stream=...)``.
        max_pages: Upper bound on the number of leading pages to OCR.
            Values of zero or below mean no page is processed.

    Returns:
        A ``(text, pages_processed, warning)`` tuple.

        * ``text`` is the recognized text, pages separated by newlines.
          When only part of the document was scanned *and* some text was
          recognized, an ``[OCR note: ...]`` line is appended.
        * ``pages_processed`` is the number of pages rendered and OCR'd.
        * ``warning`` is a non-empty message when OCR could not run, or
          when nothing was recognized in a partially scanned document;
          otherwise it is ``""``.
    """
    if fitz is None or np is None:
        return "", 0, "OCR dependencies are not available in this runtime."
    engine = get_ocr_engine()
    if engine is None:
        return "", 0, "OCR engine is not available in this runtime."

    page_texts = []
    pages_processed = 0
    with fitz.open(stream=payload, filetype="pdf") as document:
        total_pages = document.page_count
        # Clamp so a zero/negative limit cannot produce a negative range
        # or a nonsensical page count in the messages below.
        page_limit = max(0, min(total_pages, max_pages))
        for page_index in range(page_limit):
            page = document.load_page(page_index)
            # 2x zoom gives the OCR model more pixels per glyph.
            pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.height,
                pixmap.width,
                pixmap.n,
            )
            result, _ = engine(image)
            if result:
                # Each entry is indexed as (box, text, ...); keep the text.
                lines = [line[1] for line in result if len(line) > 1 and line[1]]
                if lines:
                    page_texts.append("\n".join(lines))
            pages_processed += 1

    truncated = total_pages > page_limit
    if not page_texts:
        # Nothing was recognized. Do NOT emit the truncation note as text:
        # callers count words in the returned text, and the note alone
        # would make an unreadable scan look like extracted content.
        warning = ""
        if truncated:
            warning = (
                f"OCR processed {pages_processed} of {total_pages} pages "
                "and found no readable text"
            )
        return "", pages_processed, warning

    if truncated:
        page_texts.append(
            f"\n[OCR note: processed first {page_limit} of {total_pages} pages to keep the Space responsive.]"
        )
    return "\n".join(page_texts), pages_processed, ""
 def extract_text_from_pdf(pdf_file):
+    """Extract text from a PDF upload, using OCR when no text layer exists."""
     payload, source_name = read_uploaded_pdf(pdf_file)
     text = extract_with_pypdf(payload).strip()
+    method = "PyPDF2 text layer"
+    page_count = 0
+    warning = ""
+    if len(text.split()) < 5:
+        text, page_count = extract_with_pymupdf(payload)
+        text = text.strip()
+        method = "PyMuPDF text layer"
     if len(text.split()) < 5:
+        max_pages = int(os.getenv("OCR_MAX_PAGES", "12"))
+        text, pages_processed, warning = extract_with_ocr(payload, max_pages=max_pages)
+        text = text.strip()
+        method = f"OCR over rendered PDF pages ({pages_processed} page{'s' if pages_processed != 1 else ''})"
+    return text, source_name, method, warning, page_count
 def chunk_text(text, chunk_size=500, overlap=50):
     """Split text into overlapping chunks."""
     progress(0, desc="Extracting text from PDFs...")
     for i, pdf_file in enumerate(pdf_files):
         try:
+            text, source_name, method, warning, page_count = extract_text_from_pdf(pdf_file)
         except Exception as exc:
             return f"❌ Could not read PDF: {exc}"
         pdf_chunks = chunk_text(text)
         sources.extend([source_name] * len(pdf_chunks))
         word_count = len(text.split())
         if word_count:
+            note = f"- {source_name}: {word_count:,} words extracted via {method}"
+            if warning:
+                note += f" ({warning})"
+            extraction_notes.append(note)
         else:
+            detail = warning or "no text layer or OCR-readable text was found"
             extraction_notes.append(
+                f"- {source_name}: {detail}."
             )
         progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
         return (
             "❌ No text extracted from PDFs\n\n"
             + "\n".join(extraction_notes)
+            + "\n\nThis Space now tries text extraction and OCR automatically. If this still fails, the PDF may contain "
+            "low-resolution images, protected content, or pages whose text is too blurred for OCR."
         )
     progress(0.7, desc="Building lexical retrieval index...")

requirements.txt CHANGED Viewed

@@ -2,3 +2,5 @@ gradio>=4.0.0
 huggingface-hub>=0.25.0
 PyPDF2==3.0.1
 PyMuPDF>=1.24.0

 huggingface-hub>=0.25.0
 PyPDF2==3.0.1
 PyMuPDF>=1.24.0
+numpy>=1.26.0
+rapidocr-onnxruntime>=1.3.24