Spaces:

redhairedshanks1
/

Extract-Text-and-Table

Paused

App Files Files Community

redhairedshanks1 committed on Aug 20, 2025

Commit

5d295a5

verified ·

1 Parent(s): 8ad0410

Update services/extract_text.py

Browse files

Files changed (1) hide show

services/extract_text.py +79 -84

services/extract_text.py CHANGED Viewed

@@ -170,12 +170,12 @@ import numpy as np
 from PIL import Image
 import cv2
 import re
-from concurrent.futures import ThreadPoolExecutor, as_completed
 # OCR
 from paddleocr import PaddleOCR
-# Optional Doctr OCR
 try:
     from doctr.models import ocr_predictor
     from doctr.io import DocumentFile
@@ -197,8 +197,7 @@ logger = logging.getLogger(__name__)
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
-# -------------------- Helpers --------------------
 def clean_text(text):
     return re.sub(r'\s+', ' ', text).strip()
@@ -215,9 +214,9 @@ def auto_rotate_image(pil_img):
     angle = -(90 + angle) if angle < -45 else -angle
     (h, w) = img_cv.shape[:2]
     M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
-    rotated = cv2.warpAffine(img_cv, M, (w, h),
-                             flags=cv2.INTER_CUBIC,
-                             borderMode=cv2.BORDER_REPLICATE)
     return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
@@ -242,55 +241,11 @@ def extract_images_with_fitz(pdf_path, start_page=1, end_page=None):
     return images
-# -------------------- Extractors --------------------
-def try_pymupdf_text(doc, start, end):
-    result = []
-    for i in range(start-1, end):
-        try:
-            text = doc[i].get_text("text")
-            if text.strip():
-                result.append(f"Page {i+1}:\n{clean_text(text)}")
-        except Exception as e:
-            logger.warning(f"PyMuPDF failed on page {i+1}: {e}")
-    return "\n\n".join(result)
-def try_paddleocr(images):
-    result = []
-    for page_num, img in images:
-        img = auto_rotate_image(img)
-        img_np = np.array(img)
-        try:
-            ocr_result = ocr.ocr(img_np, cls=True)
-            ocr_text = "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
-            if ocr_text.strip():
-                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
-        except Exception as e:
-            logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
-    return "\n\n".join(result)
-def try_mistralocr(images):
-    if not use_mistral_ocr:
-        return ""
-    result = []
-    for page_num, img in images:
-        try:
-            doc_img = DocumentFile.from_images(img)
-            ocr_text = mistral_ocr(doc_img).render()
-            if ocr_text.strip():
-                result.append(f"Page {page_num}:\n{clean_text(ocr_text)}")
-        except Exception as e:
-            logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
-    return "\n\n".join(result)
-# -------------------- Main Extractor --------------------
 def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
     ext = os.path.splitext(filename or "")[-1].lower()
     if ext == ".pdf":
         try:
             doc = fitz.open(file.name)
@@ -303,65 +258,105 @@ def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
         end = min(end_page or total_pages, total_pages)
         images = extract_images_with_fitz(file.name, start, end)
-        # Run all methods in parallel
-        tasks = {}
-        with ThreadPoolExecutor() as executor:
-            tasks[executor.submit(try_pymupdf_text, doc, start, end)] = "PyMuPDF"
-            tasks[executor.submit(try_paddleocr, images)] = "PaddleOCR"
-            if use_mistral_ocr:
-                tasks[executor.submit(try_mistralocr, images)] = "MistralOCR"
-            results = {}
-            for future in as_completed(tasks):
-                method = tasks[future]
                 try:
-                    text = future.result()
-                    results[method] = text
-                    logger.info(f"{method} produced {len(text.split())} words")
                 except Exception as e:
-                    logger.error(f"{method} failed: {e}")
-                    results[method] = ""
         doc.close()
-        # Append all outputs into one string
         final_output = []
-        for method, text in results.items():
-            final_output.append(f"===== Method: {method} =====\n{text or '[No text]'}\n")
         return "\n\n".join(final_output)
-    # DOCX
     elif ext == ".docx":
         from docx import Document
         doc = Document(file.name)
         paras = [p.text for p in doc.paragraphs if p.text.strip()]
-        return "===== Method: python-docx =====\n" + clean_text("\n".join(paras))
-    # CSV
     elif ext == ".csv":
         import pandas as pd
         try:
-            data = pd.read_csv(file.name).to_string(index=False)
-            return "===== Method: pandas-csv =====\n" + data
         except Exception as e:
             logger.error(f"CSV read error: {e}")
             return "[CSV Read Error]"
-    # Excel
     elif ext in [".xls", ".xlsx"]:
         import pandas as pd
         try:
             xl = pd.ExcelFile(file.name)
-            text = "\n\n".join([
-                f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}"
-                for s in xl.sheet_names
-            ])
-            return "===== Method: pandas-excel =====\n" + text
         except Exception as e:
             logger.error(f"Excel read error: {e}")
             return "[Excel Read Error]"
     else:
         return "[Unsupported file type]"

 from PIL import Image
 import cv2
 import re
+import concurrent.futures
 # OCR
 from paddleocr import PaddleOCR
+# Optional "Mistral" OCR (the imports below come from the doctr package)
 try:
     from doctr.models import ocr_predictor
     from doctr.io import DocumentFile
 ocr = PaddleOCR(use_angle_cls=True, lang='en')
+# ========================= Helpers ==============================
 def clean_text(text):
     return re.sub(r'\s+', ' ', text).strip()
     angle = -(90 + angle) if angle < -45 else -angle
     (h, w) = img_cv.shape[:2]
     M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
+    rotated = cv2.warpAffine(
+        img_cv, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE
+    )
     return Image.fromarray(cv2.cvtColor(rotated, cv2.COLOR_GRAY2RGB))
     return images
+# ========================= Extraction ==============================
 def extract_text_from_file(file, start_page=None, end_page=None, filename=None):
     ext = os.path.splitext(filename or "")[-1].lower()
+    # ---------------- PDF -----------------
     if ext == ".pdf":
         try:
             doc = fitz.open(file.name)
         end = min(end_page or total_pages, total_pages)
         images = extract_images_with_fitz(file.name, start, end)
+        results = {"PyMuPDF": [], "PaddleOCR": [], "MistralOCR": []}
+        def process_page(i):
+            page_num = i + 1
+            page_results = {}
+            # --- PyMuPDF ---
+            pymupdf_text = ""
+            try:
+                pymupdf_text = clean_text(doc[i].get_text("text"))
+            except Exception as e:
+                logger.warning(f"PyMuPDF failed on page {page_num}: {e}")
+            if len(pymupdf_text.split()) > 5:  # ignore tiny metadata
+                page_results["PyMuPDF"] = f"Page {page_num}:\n{pymupdf_text}"
+            # --- PaddleOCR ---
+            paddle_text = ""
+            try:
+                img = auto_rotate_image(images[i - (start - 1)][1])
+                img_np = np.array(img)
+                ocr_result = ocr.ocr(img_np, cls=True)
+                paddle_text = (
+                    "\n".join([line[1][0] for line in ocr_result[0]]) if ocr_result else ""
+                )
+                paddle_text = clean_text(paddle_text)
+            except Exception as e:
+                logger.warning(f"PaddleOCR failed on page {page_num}: {e}")
+            if paddle_text:
+                page_results["PaddleOCR"] = f"Page {page_num}:\n{paddle_text}"
+            # --- MistralOCR ---
+            mistral_text = ""
+            if use_mistral_ocr:
                 try:
+                    doc_img = DocumentFile.from_images(images[i - (start - 1)][1])
+                    mistral_text = mistral_ocr(doc_img).render()
+                    mistral_text = clean_text(mistral_text)
                 except Exception as e:
+                    logger.warning(f"Mistral OCR failed on page {page_num}: {e}")
+            if mistral_text:
+                page_results["MistralOCR"] = f"Page {page_num}:\n{mistral_text}"
+            return page_results
+        # Run in parallel
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [executor.submit(process_page, i) for i in range(start - 1, end)]
+            for future in concurrent.futures.as_completed(futures):
+                page_results = future.result()
+                for method, text in page_results.items():
+                    results[method].append(text)
         doc.close()
+        # Build final output (all methods separately)
         final_output = []
+        for method, texts in results.items():
+            if texts:
+                final_output.append(f"===== Method: {method} =====\n" + "\n\n".join(texts))
+            else:
+                final_output.append(f"===== Method: {method} =====\n[No text extracted]")
         return "\n\n".join(final_output)
+    # ---------------- DOCX -----------------
     elif ext == ".docx":
         from docx import Document
         doc = Document(file.name)
         paras = [p.text for p in doc.paragraphs if p.text.strip()]
+        page_texts = []
+        page_size = 500
+        for i in range(0, len(paras), page_size):
+            page_texts.append("\n".join(paras[i:i + page_size]))
+        selected_pages = page_texts
+        if start_page and end_page:
+            selected_pages = page_texts[start_page - 1:end_page]
+        return clean_text("\n\n".join(selected_pages))
+    # ---------------- CSV -----------------
     elif ext == ".csv":
         import pandas as pd
         try:
+            return pd.read_csv(file.name).to_string(index=False)
         except Exception as e:
             logger.error(f"CSV read error: {e}")
             return "[CSV Read Error]"
+    # ---------------- Excel -----------------
     elif ext in [".xls", ".xlsx"]:
         import pandas as pd
         try:
             xl = pd.ExcelFile(file.name)
+            return "\n\n".join(
+                [f"Sheet: {s}\n{xl.parse(s).to_string(index=False)}" for s in xl.sheet_names]
+            )
         except Exception as e:
             logger.error(f"Excel read error: {e}")
             return "[Excel Read Error]"
+    # ---------------- Others -----------------
     else:
         return "[Unsupported file type]"