Spaces:

pavansuresh
/

SmartContractMigrator

Sleeping

App Files Files Community

pavansuresh committed on Jul 5, 2025

Commit

c8b6167

verified ·

1 Parent(s): 8f3b77b

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -0

app.py CHANGED Viewed

@@ -28,10 +28,20 @@ def check_poppler():
     except FileNotFoundError:
         return False
 def extract_text_from_pdf(pdf_bytes):
     """Convert PDF to images and extract text using OCR."""
     if not check_poppler():
         return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
         tmp.write(pdf_bytes)
         temp_path = tmp.name
@@ -40,8 +50,10 @@ def extract_text_from_pdf(pdf_bytes):
         text = ""
         for img in images:
             text += pytesseract.image_to_string(img) + "\n"
         return text
     except Exception as e:
         return f"Error extracting text: {str(e)}"
     finally:
         if os.path.exists(temp_path):
@@ -74,7 +86,9 @@ def process_contract(pdf_bytes, object_type):
     if isinstance(text, str) and text.startswith("Error"):
         return text, {}, [], "0/1"
     key_data = extract_key_data(text)
     risks = detect_risks(key_data)
     status = "✅ Processed" if not risks else "⚠️ Processed with risks"

     except FileNotFoundError:
         return False
+def check_tesseract():
+    """Check if tesseract-ocr is installed."""
+    try:
+        subprocess.run(['tesseract', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        return True
+    except FileNotFoundError:
+        return False
 def extract_text_from_pdf(pdf_bytes):
     """Convert PDF to images and extract text using OCR."""
     if not check_poppler():
         return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
+    if not check_tesseract():
+        return "Error: tesseract-ocr not installed. Install it (e.g., 'sudo apt-get install tesseract-ocr')."
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
         tmp.write(pdf_bytes)
         temp_path = tmp.name
         text = ""
         for img in images:
             text += pytesseract.image_to_string(img) + "\n"
+        print(f"OCR completed - Extracted text length: {len(text)}")
         return text
     except Exception as e:
+        print(f"OCR failed: {str(e)}")
         return f"Error extracting text: {str(e)}"
     finally:
         if os.path.exists(temp_path):
     if isinstance(text, str) and text.startswith("Error"):
         return text, {}, [], "0/1"
+    print(f"Extracting key data")
     key_data = extract_key_data(text)
+    print(f"Detecting risks")
     risks = detect_risks(key_data)
     status = "✅ Processed" if not risks else "⚠️ Processed with risks"