Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Community

pavansuresh committed on Jul 4, 2025

Commit

608a057

verified ·

1 Parent(s): 061b9a2

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +67 -8

ocr_utils.py CHANGED Viewed

@@ -1,12 +1,71 @@
-from pdf2image import convert_from_path
 import pytesseract
 import tempfile
 def extract_text_from_pdf(pdf_path):
-    with tempfile.TemporaryDirectory() as tempdir:
-        images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
-        all_text = []
-        for img in images:
-            text = pytesseract.image_to_string(img)
-            all_text.append(text)
-        return "\n".join(all_text)

 import pytesseract
 import tempfile
+import os
+from pdf2image import convert_from_path
+from pdf2image.exceptions import PDFInfoNotInstalledError
+try:
+    import pdfplumber
+    PDFPLUMBER_AVAILABLE = True
+except ImportError:
+    PDFPLUMBER_AVAILABLE = False
def check_poppler_installed():
    """Report whether Poppler's ``pdfinfo`` executable can be found on PATH.

    pdf2image shells out to Poppler's command-line tools, so this serves as
    a cheap pre-flight check before attempting the OCR path.

    Returns:
        bool: True if an executable named ``pdfinfo`` is resolvable through
        the PATH environment variable, False otherwise.
    """
    # Function-scope import, in the same style as the local import this
    # helper used before; keeps the block self-contained.
    import shutil

    # Resolve the binary instead of running ``pdfinfo --version``: Poppler's
    # pdfinfo documents ``-v`` as its version flag, so the ``--version``
    # probe could exit non-zero and report Poppler as missing even when it
    # is installed. A PATH lookup answers the actual question and spawns no
    # process.
    return shutil.which("pdfinfo") is not None
 def extract_text_from_pdf(pdf_path):
     """Extract text from a PDF, trying OCR first and pdfplumber second.

     The PDF is rasterised with pdf2image (which needs Poppler) and every
     page image is passed to pytesseract. If Poppler is not detected, or the
     OCR path raises, the text is read with pdfplumber instead.

     Args:
         pdf_path: Filesystem path of the PDF to read.

     Returns:
         str: Page texts joined with newlines and stripped of surrounding
         whitespace. May be empty; a warning is printed in that case. An
         empty OCR result is returned as-is and does not trigger the
         pdfplumber fallback.

     Raises:
         FileNotFoundError: If ``pdf_path`` is not an existing regular file.
         ImportError: If the fallback is needed but pdfplumber is missing.
         RuntimeError: If pdfplumber fails; the underlying error is chained
             as ``__cause__``.
     """
     # isfile (not exists): a directory would otherwise pass validation and
     # only fail later, deep inside both extractors.
     if not os.path.isfile(pdf_path):
         raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")

     # Preferred path: rasterise + OCR (best for scanned PDFs).
     if check_poppler_installed():
         try:
             # Page images live in the temp dir, so OCR must finish before
             # the directory is cleaned up on exit from the ``with``.
             with tempfile.TemporaryDirectory() as tempdir:
                 images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
                 page_texts = [pytesseract.image_to_string(img) for img in images]
                 extracted_text = "\n".join(page_texts).strip()
                 if not extracted_text:
                     print("⚠️ No text extracted with pdf2image. The PDF may be empty or OCR failed.")
                 return extracted_text
         except PDFInfoNotInstalledError:
             print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
         except Exception as e:
             # Deliberately broad: any OCR-side failure is best-effort and
             # should fall through to the pdfplumber path below.
             print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
     else:
         print("❌ Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
         print("Falling back to pdfplumber for text extraction.")

     # Fallback path: only reached when OCR was skipped or raised.
     if not PDFPLUMBER_AVAILABLE:
         raise ImportError(
             "❌ pdfplumber not installed and Poppler is unavailable. "
             "Install pdfplumber with: pip install pdfplumber\n"
             "Or install Poppler with: sudo apt-get install poppler-utils"
         )

     try:
         with pdfplumber.open(pdf_path) as pdf:
             # Pages with a falsy extract_text() result are skipped.
             raw_texts = (page.extract_text() for page in pdf.pages)
             page_texts = [text for text in raw_texts if text]
             extracted_text = "\n".join(page_texts).strip()
             if not extracted_text:
                 print("⚠️ No text extracted with pdfplumber. The PDF may be scanned or empty.")
             return extracted_text
     except Exception as e:
         # Chain the cause so the pdfplumber traceback is not lost.
         raise RuntimeError(f"❌ Failed to extract text with pdfplumber: {str(e)}") from e