pavansuresh committed on
Commit
9f54a59
·
verified ·
1 Parent(s): ec09821

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -97
app.py CHANGED
@@ -1,109 +1,166 @@
1
- from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification, LayoutLMv3ImageProcessor
 
2
  import torch
3
  from PIL import Image
4
- import fitz # PyMuPDF
5
- from typing import Dict, List
6
  import os
7
- from huggingface_hub import login
 
 
8
  import re
 
 
 
9
 
10
- # Optional: Log in to Hugging Face if using a private model
11
- # login(token="your_hf_token")
 
 
12
 
13
# Load pre-trained LayoutLMv3 models.
# NOTE(review): this is the stock "microsoft/layoutlmv3-base" checkpoint — the
# token-classification head appears untrained (the code below falls back to regex),
# so label ids used downstream are placeholders; confirm before relying on them.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
# apply_ocr=False: the caller supplies the text itself instead of built-in OCR.
feature_extractor = LayoutLMv3ImageProcessor(apply_ocr=False)
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
17
 
18
def extract_key_values_with_layoutlm(text_data: str, pdf_path: str) -> Dict[str, str]:
    """
    Extract key-value pairs from PDF text using LayoutLMv3-base, with a regex fallback.

    Args:
        text_data (str): Extracted text from the PDF.
        pdf_path (str): Path to the PDF file.

    Returns:
        dict: Key-value pairs extracted from the document, or
              {"status": "failed", "error": ..., "key_values": {}} on total failure.
    """
    key_values: Dict[str, str] = {}
    try:
        # Regex fallback: first date and first dollar amount found in the raw text.
        dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text_data)
        amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text_data)
        if dates or amounts:
            # Bug fix: key was the garbled "Date bangs" in the previous revision.
            key_values.update({"Date": dates[0] if dates else "",
                               "Amount": amounts[0] if amounts else ""})

        # Attempt LayoutLMv3 processing.
        # Bug fix: module name was garbled ("fit ভাz") — it is fitz (PyMuPDF).
        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                pix = page.get_pixmap(matrix=fitz.Matrix(300 / 72, 300 / 72))  # render at 300 DPI
                img_path = f"{pdf_path}_page_{page_num}.png"
                pix.save(img_path)
                try:
                    with Image.open(img_path) as image:
                        # NOTE(review): LayoutLMv3ImageProcessor(apply_ocr=False) normally
                        # expects word lists plus bounding boxes rather than raw text
                        # lines — confirm this call against the transformers docs.
                        encoding = feature_extractor(images=[image],
                                                     text=text_data.splitlines(),
                                                     return_tensors="pt")
                    input_ids = encoding["input_ids"]
                    attention_mask = encoding["attention_mask"]

                    with torch.no_grad():
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                        predictions = torch.argmax(outputs.logits, dim=2)

                    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
                    labels = predictions[0].tolist()
                    current_key = None
                    current_value = []
                    for token, label in zip(tokens, labels):
                        if label == 1:  # key start (label ids depend on training)
                            if current_key and current_value:
                                key_values[current_key] = " ".join(current_value).strip()
                            current_key = token
                            current_value = []
                        elif label == 2 and current_key:  # value token
                            current_value.append(token)
                    if current_key and current_value:
                        key_values[current_key] = " ".join(current_value).strip()
                finally:
                    # Robustness fix: the temporary page image previously leaked when
                    # processing raised; always remove it.
                    if os.path.exists(img_path):
                        os.unlink(img_path)
        finally:
            doc.close()

        return key_values if key_values else {"status": "failed",
                                              "error": "No key-value pairs extracted",
                                              "key_values": {}}
    except Exception as e:
        # Robustness fix: the regex-fallback results were previously discarded when
        # the model path raised, contradicting the documented fallback behavior.
        if key_values:
            return key_values
        return {"status": "failed", "error": str(e), "key_values": {}}
75
-
76
def run_ai_mapping_with_layoutlm(key_values: Dict[str, str], object_field_names: List[str], pdf_path: str) -> Dict:
    """
    Map extracted key-values to object fields by case-insensitive substring match.

    Args:
        key_values (dict): Extracted key-value pairs.
        object_field_names (list): List of object field names.
        pdf_path (str): Path to the PDF file (unused here; kept for interface compatibility).

    Returns:
        dict: {"status", "mappings", "unmapped_fields", "error"}.
    """
    try:
        mappings: Dict[str, str] = {}
        unmapped_fields = object_field_names.copy()

        for field in object_field_names:
            for key, value in key_values.items():
                # Bug fix: the previous condition included
                # `any(k.lower() in field.lower() for k in key_values.keys())`,
                # which ignored the current `key` — a field could be mapped to the
                # value of an unrelated (first-iterated) key. Match only `key`.
                if field.lower() in key.lower() or key.lower() in field.lower():
                    mappings[field] = value
                    unmapped_fields.remove(field)
                    break

        return {
            "status": "success",
            "mappings": mappings,
            "unmapped_fields": unmapped_fields,
            "error": None
        }
    except Exception as e:
        return {
            "status": "failed",
            "error": str(e),
            "mappings": {},
            "unmapped_fields": object_field_names
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
3
  import torch
4
  from PIL import Image
 
 
5
  import os
6
+ import tempfile
7
+ from tqdm import tqdm
8
+ import subprocess
9
  import re
10
+ from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
11
+ from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
12
+ from salesforce_utils import get_token, create_or_update_record
13
 
14
# Initialize global state (module-level; mutated by process_contract)
contract_data = {}  # In-memory contract repository: contract_id -> record dict
processed_files = 0  # Files processed in the current batch
total_files = 0  # Files expected in the current batch
18
 
19
# Load pre-trained LayoutLMv3 model and tokenizer (placeholder for future fine-tuning)
# NOTE(review): these objects are loaded at import time but do not appear to be used
# directly in this module (extraction goes through ai_mapping) — confirm before removal.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
22
 
23
def check_poppler():
    """Return True when poppler-utils (the pdftoppm binary) is available on PATH."""
    probe = ['pdftoppm', '-v']
    try:
        # Running the binary (output discarded) doubles as the availability probe;
        # a missing executable raises FileNotFoundError.
        subprocess.run(probe, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        return False
    return True
30
+
31
def check_tesseract():
    """Return True when the tesseract-ocr binary is available on PATH."""
    probe = ['tesseract', '-v']
    try:
        # Output is suppressed; only the FileNotFoundError on a missing
        # executable matters here.
        subprocess.run(probe, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
        return False
    return True
38
+
39
def save_temp_file(pdf_bytes):
    """Write the given PDF bytes to a new .pdf temp file and return its path.

    The file is created with delete=False, so the caller is responsible for
    removing it when done.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        tmp.write(pdf_bytes)
    finally:
        tmp.close()
    return tmp.name
44
+
45
def detect_risks(data):
    """
    Detect contract risks (missing dates, large monetary amounts).

    Args:
        data (dict): Extraction result; may contain "dates" (list of str) and
            "amounts" (list of str formatted like "$1,234.56").

    Returns:
        list[str]: Human-readable risk descriptions (possibly empty).
    """
    risks = []
    if not data.get("dates", []):
        risks.append("No expiration date detected - potential obligation risk.")

    def _amount_value(amount):
        # Robustness fix: a malformed amount string previously raised ValueError
        # and aborted risk detection; treat unparseable values as 0 instead.
        try:
            return float(amount.replace('$', '').replace(',', ''))
        except (ValueError, AttributeError):
            return 0.0

    if any(_amount_value(amount) > 1000000 for amount in data.get("amounts", [])):
        risks.append("Large amount detected - review for financial risk.")
    return risks
53
+
54
def process_contract(pdf_bytes, object_type):
    """
    Run the contract pipeline: OCR, key-value extraction, risk detection, and a
    best-effort Salesforce sync; store the result in the in-memory repository.

    Args:
        pdf_bytes (bytes): Raw PDF file content.
        object_type (str): Object type label (e.g. "Contract", "Agreement").

    Returns:
        tuple: (status message, extracted key data dict, risks list, progress "n/m").
    """
    global processed_files, total_files
    total_files = 1
    processed_files = 0

    print("Received file - Starting processing")
    if not check_poppler() or not check_tesseract():
        error_msg = "Error: Required dependencies missing. Install poppler-utils (e.g., 'sudo apt-get install poppler-utils') and tesseract-ocr (e.g., 'sudo apt-get install tesseract-ocr')."
        print(error_msg)
        return error_msg, {}, [], "0/1"

    temp_path = save_temp_file(pdf_bytes)
    print(f"Temporary file created at: {temp_path}")
    try:
        text = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
        print(f"OCR result length: {len(text)}")
        if isinstance(text, str) and not text.strip():
            print("No text extracted from PDF.")
            return "❌ No text extracted from PDF.", {}, [], "0/1"

        print("Extracting key data")
        key_data = extract_key_values_with_layoutlm(text, temp_path)
        print(f"Key data extracted: {key_data}")
        if "status" in key_data and key_data["status"] == "failed":
            print(f"Extraction failed: {key_data.get('error', 'Unknown error')}")
            return f"❌ Extraction failed: {key_data.get('error', 'Unknown error')}", {}, [], "0/1"

        print("Detecting risks")
        risks = detect_risks(key_data)
        print(f"Detected risks: {risks}")
        status = "✅ Processed" if not risks else "⚠️ Processed with risks"

        # Mock CLM fields with Salesforce-ready structure
        clm_fields = {"Name": f"Contract_{len(contract_data) + 1}", "Type__c": object_type, "Status__c": status}
        clm_fields.update({k: v for k, v in key_data.items() if k not in ["status", "error", "key_values"]})

        # Optional Salesforce sync — failures are logged, never fatal.
        try:
            token, instance_url = get_token()
            sf_response = create_or_update_record(f"{object_type}__c", clm_fields, token, instance_url)
            if "error" in sf_response:
                print(f"Salesforce sync failed: {sf_response['error']}")
            else:
                print(f"Salesforce sync successful: {sf_response}")
        except Exception as e:
            print(f"Salesforce sync error: {str(e)}")

        contract_id = f"Contract_{len(contract_data) + 1}"
        contract_data[contract_id] = {
            "data": key_data,
            "risks": risks,
            "clm_fields": clm_fields,
            "status": status
        }
        processed_files = 1
        progress = "1/1"
        print(f"Processing completed - ID: {contract_id}, Progress: {progress}")
        return status, key_data, risks, progress
    finally:
        # Bug fix: the temp PDF previously leaked when detect_risks or the
        # extraction helpers raised; always remove it.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
116
+
117
def search_contracts(query):
    """Case-insensitively search the in-memory contract repository.

    Matches `query` against the string form of each stored record and returns
    the matching {contract_id: record} entries, or a "No matches" placeholder.
    """
    needle = query.lower()
    matches = {}
    for contract_id, record in contract_data.items():
        if needle in str(record).lower():
            matches[contract_id] = record
    if matches:
        return matches
    return {"No matches": "No contracts found matching the query."}
121
+
122
# Gradio UI — widget creation order defines the on-screen layout.
with gr.Blocks(title="Contract Intelligence App") as demo:
    with gr.Row():
        # type="binary" delivers each upload as raw bytes, matching process_contract.
        file_input = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contracts")
        upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)

    object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")

    process_button = gr.Button("Process Contracts")
    status_output = gr.Textbox(label="Status", interactive=False)
    extracted_data_output = gr.JSON(label="Extracted Data")
    risks_output = gr.Textbox(label="Detected Risks", interactive=False)

    def process_and_display(files, obj_type):
        # Run the pipeline per file and aggregate statuses, data, and risks for display.
        if not files:
            return "❌ No files uploaded.", {}, "No risks detected", gr.update(value="0/0")
        results = []
        all_data = {}
        all_risks = []
        for i, file in enumerate(files):
            # Per-file progress from process_contract is discarded; batch progress
            # is reported once below.
            status, data, risks, _ = process_contract(file, obj_type)
            results.append(f"{status} - File: File_{i}")
            all_data.update({f"File_{i}": data})
            all_risks.extend(risks)
        progress = f"{len(files)}/{len(files)}"
        return "\n".join(results), all_data, "\n".join(all_risks) if all_risks else "No risks detected", gr.update(value=progress)

    process_button.click(
        fn=process_and_display,
        inputs=[file_input, object_type],
        outputs=[status_output, extracted_data_output, risks_output, upload_progress]
    )

    with gr.Tab("Contract Repository"):
        search_query = gr.Textbox(label="Search Contracts", placeholder="Enter keyword...")
        search_results = gr.JSON(label="Search Results")
        search_button = gr.Button("Search")

        search_button.click(
            fn=search_contracts,
            inputs=search_query,
            outputs=search_results
        )

# Launch the app (blocking call; serves the UI).
demo.launch()