Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Community

pavansuresh committed on Jul 9, 2025

Commit

27937fa

verified ·

1 Parent(s): 120db3b

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +13 -9

ocr_utils.py CHANGED Viewed

@@ -3,13 +3,14 @@ import easyocr
 import os
 import tempfile
-def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
     """
-    Extract text from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
-        str: Extracted text from all pages, or empty string if failed.
     """
     try:
         # Save PDF to a temporary file
@@ -20,7 +21,7 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
         # Convert PDF to images using PyMuPDF
         doc = fitz.open(temp_path)
-        all_text = []
         reader = easyocr.Reader(['en'], gpu=False)  # Initialize EasyOCR, adjust languages as needed
         for page_num in range(len(doc)):
@@ -31,21 +32,24 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
-            text = " ".join([res[1] for res in results])  # Extract text from results
             if text.strip():
-                all_text.append(f"Page {page_num + 1}:\n{text}")
             else:
-                all_text.append(f"Page {page_num + 1}: No text detected")
             # Clean up temporary image
             if os.path.exists(img_path):
                 os.unlink(img_path)
         doc.close()
-        return "\n".join(all_text) if all_text else ""
     except Exception as e:
         print(f"OCR failed: {str(e)}")
-        return ""
     finally:
         if os.path.exists(temp_path):
             os.unlink(temp_path)

 import os
 import tempfile
+def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
+    Extract text and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
+        list: List of dictionaries, each containing 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) for each page.
+              Returns empty list if failed.
     """
     try:
         # Save PDF to a temporary file
         # Convert PDF to images using PyMuPDF
         doc = fitz.open(temp_path)
+        all_pages = []
         reader = easyocr.Reader(['en'], gpu=False)  # Initialize EasyOCR, adjust languages as needed
         for page_num in range(len(doc)):
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
+            text = " ".join([res[1] for res in results])  # Extract text
+            # Extract bounding boxes in [x0, y0, x1, y1] format
+            bboxes = [[res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] for res in results]
             if text.strip():
+                all_pages.append({"text": text, "bbox": bboxes})
             else:
+                all_pages.append({"text": f"Page {page_num + 1}: No text detected", "bbox": []})
             # Clean up temporary image
             if os.path.exists(img_path):
                 os.unlink(img_path)
         doc.close()
+        return all_pages
     except Exception as e:
         print(f"OCR failed: {str(e)}")
+        return []
     finally:
         if os.path.exists(temp_path):
             os.unlink(temp_path)