Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Community

pavansuresh committed on Jul 9, 2025

Commit

c842ecc

verified ·

1 Parent(s): 0407128

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +30 -7

ocr_utils.py CHANGED Viewed

@@ -5,11 +5,12 @@ import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
-    Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
-        list: List of dictionaries, each containing 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1]) for each page.
               Returns empty list if failed.
     """
     try:
@@ -30,26 +31,48 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
             img_path = f"{temp_path}_page_{page_num}.png"
             pix.save(img_path)
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
             text = " ".join([res[1] for res in results])  # Concatenated text for compatibility
             words = []
             bboxes = []
-            # Split text segments into words and assign bounding boxes
             for res in results:
                 segment_text = res[1]
                 segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]]  # [x0, y0, x1, y1]
                 segment_words = segment_text.split()
-                # Assign the same bounding box to each word in the segment
                 for word in segment_words:
                     words.append(word)
-                    bboxes.append(segment_bbox)
             if text.strip():
-                all_pages.append({"text": text, "words": words, "bbox": bboxes})
             else:
-                all_pages.append({"text": f"Page {page_num + 1}: No text detected", "words": [], "bbox": []})
             # Clean up temporary image
             if os.path.exists(img_path):

 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
+    Extract text, words, and normalized bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
+        list: List of dictionaries, each containing 'text' (str), 'words' (list of str),
+              'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) for each page.
               Returns empty list if failed.
     """
     try:
             img_path = f"{temp_path}_page_{page_num}.png"
             pix.save(img_path)
+            # Get image dimensions
+            image_width, image_height = pix.width, pix.height
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
             text = " ".join([res[1] for res in results])  # Concatenated text for compatibility
             words = []
             bboxes = []
+            # Split text segments into words and assign normalized bounding boxes
             for res in results:
                 segment_text = res[1]
                 segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]]  # [x0, y0, x1, y1]
+                # Normalize bounding box to 0-1000 range
+                normalized_bbox = [
+                    int((segment_bbox[0] / image_width) * 1000),
+                    int((segment_bbox[1] / image_height) * 1000),
+                    int((segment_bbox[2] / image_width) * 1000),
+                    int((segment_bbox[3] / image_height) * 1000)
+                ]
+                # Ensure coordinates are within 0-1000
+                normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
                 segment_words = segment_text.split()
+                # Assign the same normalized bounding box to each word in the segment
                 for word in segment_words:
                     words.append(word)
+                    bboxes.append(normalized_bbox)
             if text.strip():
+                all_pages.append({
+                    "text": text,
+                    "words": words,
+                    "bbox": bboxes,
+                    "image_dims": [image_width, image_height]
+                })
             else:
+                all_pages.append({
+                    "text": f"Page {page_num + 1}: No text detected",
+                    "words": [],
+                    "bbox": [],
+                    "image_dims": [image_width, image_height]
+                })
             # Clean up temporary image
             if os.path.exists(img_path):