redhairedshanks1 committed on
Commit
5d295a5
·
verified ·
1 Parent(s): 8ad0410

Update services/extract_text.py

Browse files
Files changed (1) hide show
  1. services/extract_text.py +79 -84
services/extract_text.py CHANGED
@@ -170,12 +170,12 @@ import numpy as np
170
  from PIL import Image
171
  import cv2
172
  import re
173
- from concurrent.futures import ThreadPoolExecutor, as_completed
174
 
175
  # OCR
176
  from paddleocr import PaddleOCR
177
 
178
- # Optional Doctr OCR
179
  try:
180
  from doctr.models import ocr_predictor
181
  from doctr.io import DocumentFile
@@ -197,8 +197,7 @@ logger = logging.getLogger(__name__)
197
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
198
 
199
 
200
- # -------------------- Helpers --------------------
201
-
202
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
204
 
@@ -215,9 +214,9 @@ def auto_rotate_image(pil_img):
215
  angle = -(90 + angle) if angle < -45 else -angle
216
  (h, w) = img_cv.shape[:2]
217
  M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
218
- rotated = cv2.warpAffine(img_cv, M, (w, h),
219
- flags=cv2.INTER_CUBIC,
220
- borderMode=cv2.BORDER_REPLICATE)
221
  return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
222
 
223
 
@@ -242,55 +241,11 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
242
  return images
243
 
244
 
245
- # -------------------- Extractors --------------------
246
-
247
def try_pymupdf_text(doc, start, end):
    """Gather the text layer of pages ``start``..``end`` of an opened document.

    ``start`` and ``end`` are 1-based and inclusive. Pages whose text is blank
    are skipped; a page that raises is logged and skipped. The surviving pages
    are returned as "Page N:" sections separated by blank lines.
    """
    sections = []
    for page_number in range(start, end + 1):
        try:
            raw = doc[page_number - 1].get_text("text")
            if not raw.strip():
                continue
            sections.append(f"Page {page_number}:\n{clean_text(raw)}")
        except Exception as e:
            logger.warning(f"PyMuPDF failed on page {page_number}: {e}")
    return "\n\n".join(sections)
257
-
258
-
259
def try_paddleocr(images):
    """Run PaddleOCR over ``images`` and return the recognised text per page.

    ``images`` is an iterable of ``(page_num, image)`` pairs. Each image is
    passed through ``auto_rotate_image`` before recognition. Pages that yield
    no text are omitted; a page that raises is logged and skipped so the
    remaining pages are still processed. The result is a string of "Page N:"
    sections separated by blank lines.
    """
    result = []
    for page_num, img in images:
        try:
            # Rotation correction lives inside the try so that a single bad
            # page is skipped instead of aborting every remaining page.
            img_np = np.array(auto_rotate_image(img))
            ocr_result = ocr.ocr(img_np, cls=True)
            # PaddleOCR reports an image with no detections as [None];
            # treat that as "no lines" rather than letting iteration raise.
            lines = ocr_result[0] if ocr_result and ocr_result[0] else []
            ocr_text = "\n".join(line[1][0] for line in lines)
            if ocr_text.strip():
                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
        except Exception as e:
            logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
    return "\n\n".join(result)
272
-
273
-
274
def try_mistralocr(images):
    """Run the optional second OCR engine over ``images``.

    Returns an empty string immediately when ``use_mistral_ocr`` is false.
    Otherwise ``images`` is iterated as ``(page_num, image)`` pairs and the
    rendered text of each page is collected as "Page N:" sections separated
    by blank lines. Pages with no text are omitted; a page that raises is
    logged and skipped.
    """
    if not use_mistral_ocr:
        return ""
    result = []
    for page_num, img in images:
        try:
            # DocumentFile.from_images only accepts paths or encoded bytes, so
            # handing it an in-memory image fails on every page. The predictor
            # itself takes a list of H x W x 3 arrays, so convert directly.
            # NOTE(review): assumes `img` is a PIL image, as the PaddleOCR path
            # treats it -- confirm against extract_images_with_fitz.
            page_array = np.array(img.convert("RGB"))
            ocr_text = mistral_ocr([page_array]).render()
            if ocr_text.strip():
                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
        except Exception as e:
            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
    return "\n\n".join(result)
287
-
288
-
289
- # -------------------- Main Extractor --------------------
290
-
291
  def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
292
  ext = os.path.splitext(filename or "")[-1].lower()
293
 
 
294
  if ext == ".pdf":
295
  try:
296
  doc = fitz.open(file.name)
@@ -303,65 +258,105 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
303
  end = min(end_page or total_pages, total_pages)
304
  images = extract_images_with_fitz(file.name, start, end)
305
 
306
- # Run all methods in parallel
307
- tasks = {}
308
- with ThreadPoolExecutor() as executor:
309
- tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
310
- tasks[executor.submit(try_paddleocr, images)] = "PaddleOCR"
311
- if use_mistral_ocr:
312
- tasks[executor.submit(try_mistralocr, images)] = "MistralOCR"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- results = {}
315
- for future in as_completed(tasks):
316
- method = tasks[future]
317
  try:
318
- text = future.result()
319
- results[method] = text
320
- logger.info(f"{method} produced {len(text.split())} words")
321
  except Exception as e:
322
- logger.error(f"{method} failed: {e}")
323
- results[method] = ""
 
 
 
 
 
 
 
 
 
 
 
324
 
325
  doc.close()
326
 
327
- # Append all outputs into one string
328
  final_output = []
329
- for method, text in results.items():
330
- final_output.append(f"===== Method: {method} =====\n{text or '[No text]'}\n")
 
 
 
331
 
332
  return "\n\n".join(final_output)
333
 
334
- # DOCX
335
  elif ext == ".docx":
336
  from docx import Document
337
  doc = Document(file.name)
338
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
339
- return "===== Method: python-docx =====\n" + clean_text("\n".join(paras))
340
-
341
- # CSV
 
 
 
 
 
 
 
342
  elif ext == ".csv":
343
  import pandas as pd
344
  try:
345
- data = pd.read_csv(file.name).to_string(index=False)
346
- return "===== Method: pandas-csv =====\n" + data
347
  except Exception as e:
348
  logger.error(f"CSV read error: {e}")
349
  return "[CSV Read Error]"
350
 
351
- # Excel
352
  elif ext in [".xls", ".xlsx"]:
353
  import pandas as pd
354
  try:
355
  xl = pd.ExcelFile(file.name)
356
- text = "\n\n".join([
357
- f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
358
- for s in xl.sheet_names
359
- ])
360
- return "===== Method: pandas-excel =====\n" + text
361
  except Exception as e:
362
  logger.error(f"Excel read error: {e}")
363
  return "[Excel Read Error]"
364
 
 
365
  else:
366
  return "[Unsupported file type]"
367
-
 
170
  from PIL import Image
171
  import cv2
172
  import re
173
+ import concurrent.futures
174
 
175
  # OCR
176
  from paddleocr import PaddleOCR
177
 
178
+ # Optional docTR OCR (the import below is doctr; its output is labelled "MistralOCR")
179
  try:
180
  from doctr.models import ocr_predictor
181
  from doctr.io import DocumentFile
 
197
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
198
 
199
 
200
+ # ========================= Helpers ==============================
 
201
def clean_text(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
203
 
 
214
  angle = -(90 + angle) if angle < -45 else -angle
215
  (h, w) = img_cv.shape[:2]
216
  M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
217
+ rotated = cv2.warpAffine(
218
+ img_cv, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
219
+ )
220
  return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
221
 
222
 
 
241
  return images
242
 
243
 
244
+ # ========================= Extraction ==============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
246
  ext = os.path.splitext(filename or "")[-1].lower()
247
 
248
+ # ---------------- PDF -----------------
249
  if ext == ".pdf":
250
  try:
251
  doc = fitz.open(file.name)
 
258
  end = min(end_page or total_pages, total_pages)
259
  images = extract_images_with_fitz(file.name, start, end)
260
 
261
+ results = {"PyMuPDF": [], "PaddleOCR": [], "MistralOCR": []}
262
+
263
def process_page(i):
    """Run every available extractor on the page at 0-based index ``i``.

    This is a closure over the enclosing function's ``doc``, ``images`` and
    ``start``. It returns a dict keyed by method name ("PyMuPDF",
    "PaddleOCR", "MistralOCR") whose values are "Page N:" sections; a method
    that produced nothing usable for this page has no key.

    NOTE(review): the enclosing code submits this function to a
    ThreadPoolExecutor, so ``doc`` and ``ocr`` are used from several threads
    at once. PyMuPDF documents that it does not support multithreaded use;
    confirm this is safe or serialise the calls.
    """
    page_num = i + 1
    # `images` is built for the requested page range only, so its index is
    # the page index shifted by the first requested page.
    image_index = i - (start - 1)
    page_results = {}

    # --- PyMuPDF ---
    pymupdf_text = ""
    try:
        pymupdf_text = clean_text(doc[i].get_text("text"))
    except Exception as e:
        logger.warning(f"PyMuPDF failed on page {page_num}: {e}")
    if len(pymupdf_text.split()) > 5:  # ignore tiny metadata
        page_results["PyMuPDF"] = f"Page {page_num}:\n{pymupdf_text}"

    # --- PaddleOCR ---
    paddle_text = ""
    try:
        img = auto_rotate_image(images[image_index][1])
        ocr_result = ocr.ocr(np.array(img), cls=True)
        # PaddleOCR reports an image with no detections as [None]; treat
        # that as "no lines" instead of logging a spurious failure.
        lines = ocr_result[0] if ocr_result and ocr_result[0] else []
        paddle_text = clean_text("\n".join(line[1][0] for line in lines))
    except Exception as e:
        logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
    if paddle_text:
        page_results["PaddleOCR"] = f"Page {page_num}:\n{paddle_text}"

    # --- MistralOCR ---
    mistral_text = ""
    if use_mistral_ocr:
        try:
            # DocumentFile.from_images only accepts paths or encoded bytes,
            # so an in-memory image fails there. The predictor takes a list
            # of H x W x 3 arrays, so convert the page image directly.
            # NOTE(review): assumes the stored image is a PIL image, as the
            # PaddleOCR step above treats it -- confirm.
            page_array = np.array(images[image_index][1].convert("RGB"))
            mistral_text = clean_text(mistral_ocr([page_array]).render())
        except Exception as e:
            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
    if mistral_text:
        page_results["MistralOCR"] = f"Page {page_num}:\n{mistral_text}"

    return page_results
304
+
305
+ # Run in parallel
306
+ with concurrent.futures.ThreadPoolExecutor() as executor:
307
+ futures = [executor.submit(process_page, i) for i in range(start - 1, end)]
308
+ for future in concurrent.futures.as_completed(futures):
309
+ page_results = future.result()
310
+ for method, text in page_results.items():
311
+ results[method].append(text)
312
 
313
  doc.close()
314
 
315
+ # Build final output (all methods separately)
316
  final_output = []
317
+ for method, texts in results.items():
318
+ if texts:
319
+ final_output.append(f"===== Method: {method} =====\n" + "\n\n".join(texts))
320
+ else:
321
+ final_output.append(f"===== Method: {method} =====\n[No text extracted]")
322
 
323
  return "\n\n".join(final_output)
324
 
325
+ # ---------------- DOCX -----------------
326
  elif ext == ".docx":
327
  from docx import Document
328
  doc = Document(file.name)
329
  paras = [p.text for p in doc.paragraphs if p.text.strip()]
330
+ page_texts = []
331
+ page_size = 500
332
+ for i in range(0, len(paras), page_size):
333
+ page_texts.append("\n".join(paras[i:i + page_size]))
334
+ selected_pages = page_texts
335
+ if start_page and end_page:
336
+ selected_pages = page_texts[start_page - 1:end_page]
337
+ return clean_text("\n\n".join(selected_pages))
338
+
339
+ # ---------------- CSV -----------------
340
  elif ext == ".csv":
341
  import pandas as pd
342
  try:
343
+ return pd.read_csv(file.name).to_string(index=False)
 
344
  except Exception as e:
345
  logger.error(f"CSV read error: {e}")
346
  return "[CSV Read Error]"
347
 
348
+ # ---------------- Excel -----------------
349
  elif ext in [".xls", ".xlsx"]:
350
  import pandas as pd
351
  try:
352
  xl = pd.ExcelFile(file.name)
353
+ return "\n\n".join(
354
+ [f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}" for s in xl.sheet_names]
355
+ )
 
 
356
  except Exception as e:
357
  logger.error(f"Excel read error: {e}")
358
  return "[Excel Read Error]"
359
 
360
+ # ---------------- Others -----------------
361
  else:
362
  return "[Unsupported file type]"