Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 5, 2025

Commit

2ae2f88

verified ·

1 Parent(s): b9ae2ff

Update ocr_utils.py

Browse files

Files changed (1) hide show

ocr_utils.py +12 -24

ocr_utils.py CHANGED Viewed

@@ -1,47 +1,35 @@
 from pdf2image import convert_from_path
 import pytesseract
-from transformers import LayoutLMv3ImageProcessor, LayoutLMv3ForTokenClassification
-from PIL import Image
-import torch
 import os
-# Load LayoutLMv3 components for OCR (optional, use if fine-tuned)
-processor = LayoutLMv3ImageProcessor(apply_ocr=True)
-model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")  # Fine-tune for OCR if needed
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
     """
-    Extract text from a scanned PDF using Tesseract or LayoutLMv3.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
         str: Extracted text from all pages, or empty string if failed.
     """
     try:
-        # Convert PDF to images (one per page)
-        images = convert_from_path(pdf_path)
         all_text = []
         for i, image in enumerate(images):
-            # Try Tesseract first
             text = pytesseract.image_to_string(image)
             # Whitespace-only OCR output is treated as "no text" for this page.
             if text.strip():
                 all_text.append(f"Page {i+1}:\n{text}")
             else:
-                # Fall back to LayoutLMv3 if Tesseract fails (simplified)
-                encoding = processor(images=[image], return_tensors="pt")
-                input_ids = encoding["input_ids"]
-                attention_mask = encoding["attention_mask"]
-                with torch.no_grad():
-                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-                    predictions = torch.argmax(outputs.logits, dim=2)
-                    tokens = processor.tokenizer.convert_ids_to_tokens(input_ids[0])
-                    labels = predictions[0].tolist()
                     # The comprehension's `i` is local to the comprehension, so the
                     # page index `i` used in the f-string below is unaffected.
-                    page_text = " ".join([tokens[i] for i, label in enumerate(labels) if label > 0])  # Adjust label logic
-                    all_text.append(f"Page {i+1} (LayoutLMv3):\n{page_text}")
         # Per-page sections are joined with newlines; no pages yields "".
         return "\n".join(all_text) if all_text else ""
     # Any exception is printed to stdout and converted into an empty-string result.
     except Exception as e:
         print(f"OCR failed: {str(e)}")
-        return ""

 from pdf2image import convert_from_path
 import pytesseract
 import os
+import tempfile
 def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
     """
     Extract text from a scanned PDF using Tesseract.
     The PDF is copied to a temporary file, rendered to one image per page,
     and each page image is run through Tesseract. The function name still
     mentions LayoutLM, but this implementation only calls Tesseract.
     Args:
         pdf_path (str): Path to the PDF file.
     Returns:
         str: One "Page N:" section per page joined by newlines (a page with
             no recognised text is reported as "Page N: No text detected"),
             or an empty string if the PDF could not be read or OCR failed.
     """
     # Bound before the try block so the cleanup in `finally` can never hit an
     # unassigned name when opening or copying the PDF fails.
     temp_path = None
     try:
         # Open the source first: a missing or unreadable PDF then fails before
         # any temporary file has been created.
         with open(pdf_path, "rb") as source:
             with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                 # Record the name immediately so a failed write is still cleaned up.
                 temp_path = tmp.name
                 tmp.write(source.read())
         images = convert_from_path(temp_path)
         all_text = []
         for i, image in enumerate(images):
             text = pytesseract.image_to_string(image)
             if text.strip():
                 all_text.append(f"Page {i+1}:\n{text}")
             else:
                 all_text.append(f"Page {i+1}: No text detected")
         # Joining an empty list already yields "", so no special case is needed.
         return "\n".join(all_text)
     except Exception as e:
         # Best-effort contract: report the failure and return an empty string.
         print(f"OCR failed: {str(e)}")
         return ""
     finally:
         # delete=False means the temporary copy must be removed explicitly.
         if temp_path is not None and os.path.exists(temp_path):
             os.unlink(temp_path)