sammoftah committed on
Commit
080adfc
·
verified ·
1 Parent(s): 9bf4536

Add OCR fallback for scanned PDFs

Browse files
Files changed (2) hide show
  1. app.py +80 -10
  2. requirements.txt +2 -0
app.py CHANGED
@@ -8,15 +8,26 @@ import math
8
  import re
9
  from collections import Counter
10
 
 
 
 
 
 
11
  try:
12
  import fitz # PyMuPDF
13
  except Exception: # pragma: no cover - optional runtime fallback
14
  fitz = None
15
 
 
 
 
 
 
16
  sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
17
  from shared.components import create_method_panel, create_premium_hero
18
 
19
  client = InferenceClient(token=os.getenv("HF_TOKEN"))
 
20
 
21
  # Global storage
22
  chunks = []
@@ -71,23 +82,78 @@ def extract_with_pypdf(payload):
71
  def extract_with_pymupdf(payload):
72
  """Second-pass extraction for PDFs PyPDF2 parses poorly."""
73
  if fitz is None:
74
- return ""
75
 
76
  text = ""
77
  with fitz.open(stream=payload, filetype="pdf") as document:
78
  for page in document:
79
  text += page.get_text("text") + "\n"
80
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  def extract_text_from_pdf(pdf_file):
83
- """Extract embedded text from a PDF upload."""
84
  payload, source_name = read_uploaded_pdf(pdf_file)
85
  text = extract_with_pypdf(payload).strip()
 
 
 
 
 
 
 
 
86
 
87
  if len(text.split()) < 5:
88
- text = extract_with_pymupdf(payload).strip()
 
 
 
89
 
90
- return text, source_name
91
 
92
  def chunk_text(text, chunk_size=500, overlap=50):
93
  """Split text into overlapping chunks."""
@@ -114,7 +180,7 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
114
  progress(0, desc="Extracting text from PDFs...")
115
  for i, pdf_file in enumerate(pdf_files):
116
  try:
117
- text, source_name = extract_text_from_pdf(pdf_file)
118
  except Exception as exc:
119
  return f"❌ Could not read PDF: {exc}"
120
  pdf_chunks = chunk_text(text)
@@ -122,10 +188,14 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
122
  sources.extend([source_name] * len(pdf_chunks))
123
  word_count = len(text.split())
124
  if word_count:
125
- extraction_notes.append(f"- {source_name}: {word_count:,} words extracted")
 
 
 
126
  else:
 
127
  extraction_notes.append(
128
- f"- {source_name}: no embedded text found. This usually means the PDF is scanned/image-only and needs OCR."
129
  )
130
  progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
131
 
@@ -133,8 +203,8 @@ def process_pdfs(pdf_files, progress=gr.Progress()):
133
  return (
134
  "❌ No text extracted from PDFs\n\n"
135
  + "\n".join(extraction_notes)
136
- + "\n\nTry a text-based PDF, or run OCR first with a tool such as Adobe OCR, macOS Preview/Live Text export, "
137
- "Google Drive OCR, or `ocrmypdf`, then upload the searchable PDF."
138
  )
139
 
140
  progress(0.7, desc="Building lexical retrieval index...")
 
8
  import re
9
  from collections import Counter
10
 
11
+ try:
12
+ import numpy as np
13
+ except Exception: # pragma: no cover - optional runtime fallback
14
+ np = None
15
+
16
  try:
17
  import fitz # PyMuPDF
18
  except Exception: # pragma: no cover - optional runtime fallback
19
  fitz = None
20
 
21
+ try:
22
+ from rapidocr_onnxruntime import RapidOCR
23
+ except Exception: # pragma: no cover - optional runtime fallback
24
+ RapidOCR = None
25
+
26
  sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
27
  from shared.components import create_method_panel, create_premium_hero
28
 
29
  client = InferenceClient(token=os.getenv("HF_TOKEN"))
30
+ ocr_engine = None
31
 
32
  # Global storage
33
  chunks = []
 
82
  def extract_with_pymupdf(payload):
83
  """Second-pass extraction for PDFs PyPDF2 parses poorly."""
84
  if fitz is None:
85
+ return "", 0
86
 
87
  text = ""
88
  with fitz.open(stream=payload, filetype="pdf") as document:
89
  for page in document:
90
  text += page.get_text("text") + "\n"
91
+ page_count = document.page_count
92
+ return text, page_count
93
+
94
def get_ocr_engine():
    """Return the shared OCR engine, building it on first use.

    Construction is deferred so PDFs with a normal text layer never pay the
    OCR start-up cost.

    Returns:
        The module-level ``RapidOCR`` instance, or ``None`` when the
        ``rapidocr_onnxruntime`` import failed.
    """
    global ocr_engine
    if RapidOCR is None:
        engine = None
    else:
        if ocr_engine is None:
            ocr_engine = RapidOCR()
        engine = ocr_engine
    return engine
102
+
103
def extract_with_ocr(payload, max_pages=12):
    """Render PDF pages to images and OCR them when no embedded text exists.

    Args:
        payload: PDF content handed to ``fitz.open`` as an in-memory stream.
        max_pages: Upper bound on the number of leading pages to render and
            OCR. Negative values are treated as 0.

    Returns:
        A ``(text, pages_processed, warning)`` tuple.

        * ``text`` holds only recognized text, one OCR line per text line.
          Status messages are never mixed into it, so it can be indexed and
          word-counted as document content.
        * ``pages_processed`` is the number of pages that were rendered and
          passed to the OCR engine.
        * ``warning`` is ``""`` on a complete run; otherwise it explains why
          OCR was unavailable or reports that only the first pages were read.
    """
    if fitz is None or np is None:
        return "", 0, "OCR dependencies are not available in this runtime."

    engine = get_ocr_engine()
    if engine is None:
        return "", 0, "OCR engine is not available in this runtime."

    # Guard against a negative limit so the page range and the truncation
    # message below stay meaningful.
    page_cap = max(0, max_pages)

    page_texts = []
    pages_processed = 0
    with fitz.open(stream=payload, filetype="pdf") as document:
        # Read the page count while the document is still open.
        total_pages = document.page_count
        page_limit = min(total_pages, page_cap)
        for page_index in range(page_limit):
            page = document.load_page(page_index)
            # Matrix(2, 2) scales the render by 2x in each direction, giving
            # the OCR engine more pixels per glyph than the default render.
            pixmap = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
            image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
                pixmap.height,
                pixmap.width,
                pixmap.n,
            )
            result, _ = engine(image)
            pages_processed += 1
            if not result:
                continue
            # Each entry is indexed positionally; index 1 is used as the text.
            lines = [line[1] for line in result if len(line) > 1 and line[1]]
            if lines:
                page_texts.append("\n".join(lines))

    text = "\n".join(page_texts)

    # Report truncation through the warning channel instead of appending it to
    # the text: a note inside the text would be chunked and indexed as if it
    # were document content, and would make an unreadable scan look non-empty.
    warning = ""
    if total_pages > page_limit:
        if text.strip():
            warning = (
                f"OCR processed first {page_limit} of {total_pages} pages "
                "to keep the Space responsive"
            )
        else:
            warning = (
                f"no OCR-readable text was found in the first {page_limit} "
                f"of {total_pages} pages"
            )

    return text, pages_processed, warning
136
 
137
  def extract_text_from_pdf(pdf_file):
138
+ """Extract text from a PDF upload, using OCR when no text layer exists."""
139
  payload, source_name = read_uploaded_pdf(pdf_file)
140
  text = extract_with_pypdf(payload).strip()
141
+ method = "PyPDF2 text layer"
142
+ page_count = 0
143
+ warning = ""
144
+
145
+ if len(text.split()) < 5:
146
+ text, page_count = extract_with_pymupdf(payload)
147
+ text = text.strip()
148
+ method = "PyMuPDF text layer"
149
 
150
  if len(text.split()) < 5:
151
+ max_pages = int(os.getenv("OCR_MAX_PAGES", "12"))
152
+ text, pages_processed, warning = extract_with_ocr(payload, max_pages=max_pages)
153
+ text = text.strip()
154
+ method = f"OCR over rendered PDF pages ({pages_processed} page{'s' if pages_processed != 1 else ''})"
155
 
156
+ return text, source_name, method, warning, page_count
157
 
158
  def chunk_text(text, chunk_size=500, overlap=50):
159
  """Split text into overlapping chunks."""
 
180
  progress(0, desc="Extracting text from PDFs...")
181
  for i, pdf_file in enumerate(pdf_files):
182
  try:
183
+ text, source_name, method, warning, page_count = extract_text_from_pdf(pdf_file)
184
  except Exception as exc:
185
  return f"❌ Could not read PDF: {exc}"
186
  pdf_chunks = chunk_text(text)
 
188
  sources.extend([source_name] * len(pdf_chunks))
189
  word_count = len(text.split())
190
  if word_count:
191
+ note = f"- {source_name}: {word_count:,} words extracted via {method}"
192
+ if warning:
193
+ note += f" ({warning})"
194
+ extraction_notes.append(note)
195
  else:
196
+ detail = warning or "no text layer or OCR-readable text was found"
197
  extraction_notes.append(
198
+ f"- {source_name}: {detail}."
199
  )
200
  progress((i + 1) / len(pdf_files), desc=f"Processed {i+1}/{len(pdf_files)} PDFs")
201
 
 
203
  return (
204
  "❌ No text extracted from PDFs\n\n"
205
  + "\n".join(extraction_notes)
206
+ + "\n\nThis Space now tries text extraction and OCR automatically. If this still fails, the PDF may contain "
207
+ "low-resolution images, protected content, or pages whose text is too blurred for OCR."
208
  )
209
 
210
  progress(0.7, desc="Building lexical retrieval index...")
requirements.txt CHANGED
@@ -2,3 +2,5 @@ gradio>=4.0.0
2
  huggingface-hub>=0.25.0
3
  PyPDF2==3.0.1
4
  PyMuPDF>=1.24.0
 
 
 
2
  huggingface-hub>=0.25.0
3
  PyPDF2==3.0.1
4
  PyMuPDF>=1.24.0
5
+ numpy>=1.26.0
6
+ rapidocr-onnxruntime>=1.3.24