Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 9, 2025

Commit

ee4c8aa

verified ·

1 Parent(s): 2970e4e

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +24 -8

ocr_utils.py CHANGED Viewed

@@ -1,31 +1,47 @@
-from pdf2image import convert_from_path
-import pytesseract
 import os
 import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
     """
     Extract text from a scanned PDF using Tesseract.
     The PDF is copied into a private scratch directory, rasterised with
     pdf2image, and every page image is passed to pytesseract. The scratch
     directory is removed before returning, whether or not OCR succeeded.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
         str: Extracted text from all pages, or empty string if failed.
     """
     try:
         # A TemporaryDirectory (rather than NamedTemporaryFile(delete=False))
         # guarantees the PDF copy is deleted on every exit path.
         with tempfile.TemporaryDirectory() as work_dir:
             temp_path = os.path.join(work_dir, "input.pdf")
             with open(pdf_path, 'rb') as src, open(temp_path, 'wb') as dst:
                 dst.write(src.read())
             images = convert_from_path(temp_path)
         all_text = []
         for page_no, image in enumerate(images, start=1):
             text = pytesseract.image_to_string(image)
             if text.strip():
                 all_text.append(f"Page {page_no}:\n{text}")
             else:
                 all_text.append(f"Page {page_no}: No text detected")
         # Joining an empty list already yields "", so no special case needed.
         return "\n".join(all_text)
     except Exception as e:
         # Best-effort contract: report the failure and honour the documented
         # "empty string if failed" return value instead of returning None.
         print(f"OCR failed: {str(e)}")
         return ""

+import fitz  # PyMuPDF
+import easyocr
 import os
 import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
     """
     Extract text from a scanned PDF using PyMuPDF and EasyOCR.
     Each page is rendered to a 300 DPI PNG inside a private scratch
     directory, read by EasyOCR, and the recognised fragments are joined
     with single spaces. The scratch directory (PDF copy and page images)
     is removed before returning, whether or not OCR succeeded.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
         str: Extracted text from all pages, or empty string if failed.
     """
     try:
         all_text = []
         # A TemporaryDirectory guarantees that the PDF copy and every page
         # image are deleted even when rendering or OCR raises midway.
         with tempfile.TemporaryDirectory() as work_dir:
             temp_path = os.path.join(work_dir, "input.pdf")
             with open(pdf_path, 'rb') as src, open(temp_path, 'wb') as dst:
                 dst.write(src.read())
             doc = fitz.open(temp_path)
             try:
                 # Initialize EasyOCR once; adjust languages as needed.
                 reader = easyocr.Reader(['en'], gpu=False)
                 # PDF user space is 72 units per inch, so 300/72 gives 300 DPI.
                 zoom = fitz.Matrix(300 / 72, 300 / 72)
                 for page_num, page in enumerate(doc):
                     img_path = os.path.join(work_dir, f"page_{page_num}.png")
                     page.get_pixmap(matrix=zoom).save(img_path)
                     results = reader.readtext(img_path)
                     # Each result is (bbox, text, confidence); keep the text.
                     text = " ".join(res[1] for res in results)
                     if text.strip():
                         all_text.append(f"Page {page_num + 1}:\n{text}")
                     else:
                         all_text.append(f"Page {page_num + 1}: No text detected")
             finally:
                 # Close before the directory is removed so the PDF copy is
                 # no longer held open when cleanup runs.
                 doc.close()
         # Joining an empty list already yields "", so no special case needed.
         return "\n".join(all_text)
     except Exception as e:
         # Best-effort contract: report the failure and honour the documented
         # "empty string if failed" return value instead of returning None.
         print(f"OCR failed: {str(e)}")
         return ""