redhairedshanks1 committed on
Commit
cbaab47
·
verified ·
1 Parent(s): 5d43a8b

Update services/extract_text.py

Browse files
Files changed (1) hide show
  1. services/extract_text.py +34 -15
services/extract_text.py CHANGED
@@ -175,7 +175,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
175
  # OCR
176
  from paddleocr import PaddleOCR
177
 
178
- # Optional Mistral OCR
179
  try:
180
  from doctr.models import ocr_predictor
181
  from doctr.io import DocumentFile
@@ -196,9 +196,13 @@ logger = logging.getLogger(__name__)
196
  # PaddleOCR
197
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
198
 
 
 
 
199
  def clean_text(text):
200
  return re.sub(r'\s+', ' ', text).strip()
201
 
 
202
  def auto_rotate_image(pil_img):
203
  """Auto-rotate PIL image safely."""
204
  if pil_img.mode != "RGB":
@@ -216,6 +220,7 @@ def auto_rotate_image(pil_img):
216
  borderMode=cv2.BORDER_REPLICATE)
217
  return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
218
 
 
219
  def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
220
  images = []
221
  try:
@@ -237,21 +242,21 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
237
  return images
238
 
239
 
240
- # -------------------- Parallel Extraction Wrapper --------------------
241
 
242
  def try_pymupdf_text(doc, start, end):
243
- """Try extracting text using PyMuPDF native text extraction"""
244
  result = []
245
  for i in range(start-1, end):
246
- page = doc[i]
247
- text = page.get_text()
248
- if text.strip():
249
- result.append(f"Page {i+1}:\n{clean_text(text)}")
 
 
250
  return "\n\n".join(result)
251
 
252
 
253
  def try_paddleocr(images):
254
- """Try OCR using PaddleOCR"""
255
  result = []
256
  for page_num, img in images:
257
  img = auto_rotate_image(img)
@@ -259,14 +264,14 @@ def try_paddleocr(images):
259
  try:
260
  ocr_result = ocr.ocr(img_np, cls=True)
261
  ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
262
- result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
 
263
  except Exception as e:
264
  logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
265
  return "\n\n".join(result)
266
 
267
 
268
  def try_mistralocr(images):
269
- """Try OCR using Mistral/Doctr OCR"""
270
  if not use_mistral_ocr:
271
  return ""
272
  result = []
@@ -274,7 +279,8 @@ def try_mistralocr(images):
274
  try:
275
  doc_img = DocumentFile.from_images(img)
276
  ocr_text = mistral_ocr(doc_img).render()
277
- result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
 
278
  except Exception as e:
279
  logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
280
  return "\n\n".join(result)
@@ -297,6 +303,7 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
297
  end = min(end_page or total_pages, total_pages)
298
  images = extract_images_with_fitz(file.name, start, end)
299
 
 
300
  tasks = {}
301
  with ThreadPoolExecutor() as executor:
302
  tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
@@ -310,23 +317,34 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
310
  try:
311
  text = future.result()
312
  results[method] = text
 
313
  except Exception as e:
314
  logger.error(f"{method} failed: {e}")
315
  results[method] = ""
316
 
317
  doc.close()
318
 
319
- # Pick the longest text among the methods
320
- best_method, best_text = max(results.items(), key=lambda kv: len(kv[1].strip()))
321
- logger.info(f"Best extraction chosen: {best_method} (length {len(best_text)})")
322
- return best_text or "[No text extracted]"
 
 
 
 
 
 
 
 
323
 
 
324
  elif ext == ".docx":
325
  from docx import Document
326
  doc = Document(file.name)
327
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
328
  return clean_text("\n".join(paras))
329
 
 
330
  elif ext == ".csv":
331
  import pandas as pd
332
  try:
@@ -335,6 +353,7 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
335
  logger.error(f"CSV read error: {e}")
336
  return "[CSV Read Error]"
337
 
 
338
  elif ext in [".xls", ".xlsx"]:
339
  import pandas as pd
340
  try:
 
175
  # OCR
176
  from paddleocr import PaddleOCR
177
 
178
+ # Optional Doctr OCR
179
  try:
180
  from doctr.models import ocr_predictor
181
  from doctr.io import DocumentFile
 
196
  # PaddleOCR
197
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
198
 
199
+
200
+ # -------------------- Helpers --------------------
201
+
202
  def clean_text(text):
203
  return re.sub(r'\s+', ' ', text).strip()
204
 
205
+
206
  def auto_rotate_image(pil_img):
207
  """Auto-rotate PIL image safely."""
208
  if pil_img.mode != "RGB":
 
220
  borderMode=cv2.BORDER_REPLICATE)
221
  return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
222
 
223
+
224
  def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
225
  images = []
226
  try:
 
242
  return images
243
 
244
 
245
+ # -------------------- Extractors --------------------
246
 
247
  def try_pymupdf_text(doc, start, end):
 
248
  result = []
249
  for i in range(start-1, end):
250
+ try:
251
+ text = doc[i].get_text("text")
252
+ if text.strip():
253
+ result.append(f"Page {i+1}:\n{clean_text(text)}")
254
+ except Exception as e:
255
+ logger.warning(f"PyMuPDF failed on page {i+1}: {e}")
256
  return "\n\n".join(result)
257
 
258
 
259
  def try_paddleocr(images):
 
260
  result = []
261
  for page_num, img in images:
262
  img = auto_rotate_image(img)
 
264
  try:
265
  ocr_result = ocr.ocr(img_np, cls=True)
266
  ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
267
+ if ocr_text.strip():
268
+ result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
269
  except Exception as e:
270
  logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
271
  return "\n\n".join(result)
272
 
273
 
274
  def try_mistralocr(images):
 
275
  if not use_mistral_ocr:
276
  return ""
277
  result = []
 
279
  try:
280
  doc_img = DocumentFile.from_images(img)
281
  ocr_text = mistral_ocr(doc_img).render()
282
+ if ocr_text.strip():
283
+ result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
284
  except Exception as e:
285
  logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
286
  return "\n\n".join(result)
 
303
  end = min(end_page or total_pages, total_pages)
304
  images = extract_images_with_fitz(file.name, start, end)
305
 
306
+ # Run all methods in parallel
307
  tasks = {}
308
  with ThreadPoolExecutor() as executor:
309
  tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
 
317
  try:
318
  text = future.result()
319
  results[method] = text
320
+ logger.info(f"{method} produced {len(text.split())} words")
321
  except Exception as e:
322
  logger.error(f"{method} failed: {e}")
323
  results[method] = ""
324
 
325
  doc.close()
326
 
327
+ # Selection logic
328
+ best_method, best_text = max(
329
+ results.items(),
330
+ key=lambda kv: len(kv[1].split()) # choose longest by word count
331
+ )
332
+
333
+ logger.info(f"✅ Best extraction chosen: {best_method} "
334
+ f"(words: {len(best_text.split())})")
335
+
336
+ if not best_text.strip():
337
+ return "[No text extracted]"
338
+ return best_text
339
 
340
+ # DOCX
341
  elif ext == ".docx":
342
  from docx import Document
343
  doc = Document(file.name)
344
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
345
  return clean_text("\n".join(paras))
346
 
347
+ # CSV
348
  elif ext == ".csv":
349
  import pandas as pd
350
  try:
 
353
  logger.error(f"CSV read error: {e}")
354
  return "[CSV Read Error]"
355
 
356
+ # Excel
357
  elif ext in [".xls", ".xlsx"]:
358
  import pandas as pd
359
  try: