Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 15, 2025

Commit

4968aaa

verified ·

1 Parent(s): afc39b5

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +15 -2

ocr_utils.py CHANGED Viewed

@@ -2,6 +2,11 @@ import fitz  # PyMuPDF
 import easyocr
 import os
 import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
@@ -19,23 +24,30 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
             with open(pdf_path, 'rb') as f:
                 tmp.write(f.read())
             temp_path = tmp.name
         # Convert PDF to images using PyMuPDF
         doc = fitz.open(temp_path)
         all_pages = []
         reader = easyocr.Reader(['en'], gpu=False)  # Initialize EasyOCR, adjust languages as needed
         for page_num in range(len(doc)):
             page = doc[page_num]
-            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))  # 300 DPI
             img_path = f"{temp_path}_page_{page_num}.png"
             pix.save(img_path)
             # Get image dimensions
             image_width, image_height = pix.width, pix.height
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
             text = " ".join([res[1] for res in results])  # Concatenated text for compatibility
             words = []
             bboxes = []
@@ -79,9 +91,10 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
                 os.unlink(img_path)
         doc.close()
         return all_pages
     except Exception as e:
-        print(f"OCR failed: {str(e)}")
         return []
     finally:
         if os.path.exists(temp_path):

 import easyocr
 import os
 import tempfile
+import logging
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
     """
             with open(pdf_path, 'rb') as f:
                 tmp.write(f.read())
             temp_path = tmp.name
+        logger.info(f"Temporary PDF created at: {temp_path}")
         # Convert PDF to images using PyMuPDF
         doc = fitz.open(temp_path)
+        if not doc.page_count:
+            logger.error(f"PDF is empty or unreadable: {pdf_path}")
+            return []
         all_pages = []
         reader = easyocr.Reader(['en'], gpu=False)  # Initialize EasyOCR, adjust languages as needed
         for page_num in range(len(doc)):
             page = doc[page_num]
+            pix = page.get_pixmap(matrix=fitz.Matrix(400/72, 400/72))  # Increased DPI to 400 for better detection
             img_path = f"{temp_path}_page_{page_num}.png"
             pix.save(img_path)
+            logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")
             # Get image dimensions
             image_width, image_height = pix.width, pix.height
             # Perform OCR using EasyOCR
             results = reader.readtext(img_path)
+            if not results:
+                logger.warning(f"No text detected on page {page_num + 1}")
             text = " ".join([res[1] for res in results])  # Concatenated text for compatibility
             words = []
             bboxes = []
                 os.unlink(img_path)
         doc.close()
+        logger.info(f"Extracted data from {len(all_pages)} pages")
         return all_pages
     except Exception as e:
+        logger.error(f"OCR failed: {str(e)}")
         return []
     finally:
         if os.path.exists(temp_path):