Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 4, 2025

Commit

ab6b7bb

verified ·

1 Parent(s): 5f8a311

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +26 -29

ocr_utils.py CHANGED Viewed

@@ -13,59 +13,56 @@ def check_poppler_installed():
     """Check if Poppler's pdfinfo is installed and in PATH."""
     import subprocess
     try:
-        subprocess.run(['pdfinfo', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
         return True
     except (subprocess.CalledProcessError, FileNotFoundError):
         return False
 def extract_text_from_pdf(pdf_path):
     """
     Extract text from a PDF file using pdf2image and pytesseract for OCR.
-    Fallback to pdfplumber if Poppler is not installed or pdf2image fails.
     """
     # Validate PDF path
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")
     # Try pdf2image with Poppler for OCR (best for scanned PDFs)
     if check_poppler_installed():
         try:
             with tempfile.TemporaryDirectory() as tempdir:
                 images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
-                all_text = []
-                for img in images:
-                    text = pytesseract.image_to_string(img)
-                    all_text.append(text)
-                extracted_text = "\n".join(all_text).strip()
-                if not extracted_text:
-                    print("⚠️ No text extracted with pdf2image. The PDF may be empty or OCR failed.")
-                return extracted_text
         except PDFInfoNotInstalledError:
             print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
         except Exception as e:
             print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
-    else:
-        print("❌ Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
-        print("Falling back to pdfplumber for text extraction.")
     # Fallback to pdfplumber if pdf2image fails or Poppler is not installed
     if not PDFPLUMBER_AVAILABLE:
         raise ImportError(
             "❌ pdfplumber not installed and Poppler is unavailable. "
             "Install pdfplumber with: pip install pdfplumber\n"
-            "Or install Poppler with: sudo apt-get install poppler-utils"
-        )
-    try:
-        with pdfplumber.open(pdf_path) as pdf:
-            all_text = []
-            for page in pdf.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    all_text.append(page_text)
-            extracted_text = "\n".join(all_text).strip()
-            if not extracted_text:
-                print("⚠️ No text extracted with pdfplumber. The PDF may be scanned or empty.")
-            return extracted_text
-    except Exception as e:
-        raise Exception(f"❌ Failed to extract text with pdfplumber: {str(e)}")

     """Check if Poppler's pdfinfo is installed and in PATH."""
     import subprocess
     try:
+        result = subprocess.run(['pdfinfo', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
+        print(f"✅ Poppler found: {result.stdout.decode().strip()}")
         return True
     except (subprocess.CalledProcessError, FileNotFoundError):
+        print("❌ Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
         return False
 def extract_text_from_pdf(pdf_path):
     """
     Extract text from a PDF file using pdf2image and pytesseract for OCR.
+    Fallback to pdfplumber for text-based PDFs. Returns structured JSON per page.
     """
+    print(f"Processing PDF: {pdf_path}")
     # Validate PDF path
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")
+    # Check for tesseract-ocr
+    try:
+        pytesseract.get_tesseract_version()
+        print("✅ Tesseract OCR found")
+    except pytesseract.TesseractNotFoundError:
+        raise Exception("❌ Tesseract OCR not found. Install it with: sudo apt-get install tesseract-ocr")
     # Try pdf2image with Poppler for OCR (best for scanned PDFs)
+    result = {"pages": [], "status": "success", "error": None}
     if check_poppler_installed():
         try:
             with tempfile.TemporaryDirectory() as tempdir:
+                print(f"Converting PDF to images in temp directory: {tempdir}")
                 images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
+                print(f"Converted {len(images)} pages to images")
+                for i, img in enumerate(images):
+                    text = pytesseract.image_to_string(img).strip()
+                    print(f"Extracted text from page {i+1}: {text[:50]}...")
+                    result["pages"].append({"page_number": i+1, "text": text})
+                if not result["pages"]:
+                    result["status"] = "failed"
+                    result["error"] = "No text extracted with pdf2image. The PDF may be empty or OCR failed."
+                return result
         except PDFInfoNotInstalledError:
             print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
         except Exception as e:
             print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
+            result["status"] = "failed"
+            result["error"] = str(e)
     # Fallback to pdfplumber if pdf2image fails or Poppler is not installed
     if not PDFPLUMBER_AVAILABLE:
         raise ImportError(
             "❌ pdfplumber not installed and Poppler is unavailable. "
             "Install pdfplumber with: pip install pdfplumber\n"
+            Long context detected, continuing in next response...