Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 9, 2025

Commit

0a21337

verified ·

1 Parent(s): 3982363

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +17 -7

ocr_utils.py CHANGED Viewed

@@ -5,11 +5,11 @@ import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
-    Extract text and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
-        list: List of dictionaries, each containing 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) for each page.
               Returns empty list if failed.
     """
     try:
@@ -32,14 +32,24 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
-            text = " ".join([res[1] for res in results])  # Extract text
-            # Extract bounding boxes in [x0, y0, x1, y1] format
-            bboxes = [[res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] for res in results]
             if text.strip():
-                all_pages.append({"text": text, "bbox": bboxes})
             else:
-                all_pages.append({"text": f"Page {page_num + 1}: No text detected", "bbox": []})
             # Clean up temporary image
             if os.path.exists(img_path):

 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
+    Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
+        list: List of dictionaries, each containing 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1]) for each page.
               Returns empty list if failed.
     """
     try:
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
+            text = " ".join([res[1] for res in results])  # Concatenated text for compatibility
+            words = []
+            bboxes = []
+            # Split text segments into words and assign bounding boxes
+            for res in results:
+                segment_text = res[1]
+                segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]]  # [x0, y0, x1, y1]
+                segment_words = segment_text.split()
+                # Assign the same bounding box to each word in the segment
+                for word in segment_words:
+                    words.append(word)
+                    bboxes.append(segment_bbox)
             if text.strip():
+                all_pages.append({"text": text, "words": words, "bbox": bboxes})
             else:
+                all_pages.append({"text": f"Page {page_num + 1}: No text detected", "words": [], "bbox": []})
             # Clean up temporary image
             if os.path.exists(img_path):