pavansuresh committed on
Commit
8f3b77b
·
verified ·
1 Parent(s): 9230bf8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -178
app.py CHANGED
@@ -1,210 +1,142 @@
1
  import gradio as gr
2
- from ai_mapping import run_ai_mapping_with_layoutlm, extract_key_values_with_layoutlm
3
- from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
 
 
 
4
  import os
5
  import tempfile
6
  from tqdm import tqdm
7
  import subprocess
 
8
 
9
- # Initialize global state for failed records and uploaded files
 
10
  failed_records = []
11
- uploaded_file_details = {}
 
12
 
13
- def is_pdf_file(file_bytes):
14
- """Check if the file is a valid PDF by reading the header from bytes."""
15
- valid_pdf_header = b'%PDF-'
16
- return file_bytes.startswith(valid_pdf_header) if file_bytes else False
17
 
18
  def check_poppler():
19
- """Check if poppler-utils is installed and in PATH."""
20
  try:
21
  subprocess.run(['pdftoppm', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
22
  return True
23
  except FileNotFoundError:
24
  return False
25
 
26
- def save_failed_record(pdf_name, object_name, error, mappings):
27
- """Log failed records for reconciliation."""
28
- global failed_records
29
- failed_records.append({
30
- "pdf_name": pdf_name,
31
- "object_name": object_name,
32
- "error": error,
33
- "mappings": mappings
34
- })
35
-
36
- def save_uploaded_file_details(pdf_name, temp_path):
37
- """Store file details securely."""
38
- global uploaded_file_details
39
- uploaded_file_details[pdf_name] = {"temp_path": temp_path, "processed": False}
40
-
41
- def process_contract(uploaded_files, object_name, manual_mappings):
42
- """Process uploaded PDFs locally with mock Salesforce object fields."""
43
- if not uploaded_files:
44
- return "❌ No files uploaded.", None, failed_records, "0/0"
45
-
46
- # Debug: Log uploaded files and their raw data
47
- print(f"Received files (bytes): {len(uploaded_files)} files at {len(uploaded_files)}")
48
- for i, file_bytes in enumerate(uploaded_files):
49
- print(f"File {i} header: {file_bytes[:5]} - Starting processing")
50
-
51
- # Check for poppler-utils
52
  if not check_poppler():
53
- return "Error: poppler-utils is not installed or not in PATH. Please install it (e.g., 'sudo apt-get install poppler-utils' on Linux).", None, failed_records, "0/0"
54
-
55
- # Mock Salesforce object fields
56
- mock_object_fields = ["Name", "Description", "Amount", "Date"] if object_name else []
57
- total_files = len(uploaded_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  processed_files = 0
59
- results = []
60
- ai_result = None
61
- with tqdm(total=total_files, desc="Processing PDFs") as pbar:
62
- for i, file_bytes in enumerate(uploaded_files):
63
- pdf_name = f"uploaded_file_{i}.pdf"
64
- if not is_pdf_file(file_bytes):
65
- save_failed_record(pdf_name, object_name, "Invalid PDF content", {})
66
- results.append(f"❌ {pdf_name}: Invalid PDF content")
67
- processed_files += 1
68
- pbar.update(1)
69
- continue
70
-
71
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
72
- tmp.write(file_bytes)
73
- temp_path = tmp.name
74
- save_uploaded_file_details(pdf_name, temp_path)
75
-
76
- try:
77
- print(f"Processing {pdf_name} - OCR stage")
78
- text_data = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
79
- if not text_data:
80
- save_failed_record(pdf_name, object_name, "No text extracted from PDF", {})
81
- results.append(f"⚠️ {pdf_name}: No text extracted")
82
- processed_files += 1
83
- pbar.update(1)
84
- continue
85
-
86
- print(f"Processing {pdf_name} - AI mapping stage")
87
- key_values = extract_key_values_with_layoutlm(text_data, temp_path)
88
- ai_result = run_ai_mapping_with_layoutlm(key_values, mock_object_fields, temp_path)
89
- if ai_result['status'] == 'failed':
90
- save_failed_record(pdf_name, object_name, ai_result['error'], ai_result['mappings'])
91
- results.append(f"❌ {pdf_name}: {ai_result['error']}")
92
- processed_files += 1
93
- pbar.update(1)
94
- continue
95
-
96
- mappings = {k: v for k, v in ai_result['mappings'].items()}
97
- for field, value in manual_mappings.items():
98
- if value and field in mock_object_fields:
99
- mappings[field] = value
100
-
101
- results.append(f"✅ {pdf_name}: Data processed locally (Mock ID: {hash(pdf_name)})")
102
- processed_files += 1
103
- pbar.update(1)
104
- except Exception as e:
105
- save_failed_record(pdf_name, object_name, str(e), {})
106
- results.append(f"❌ {pdf_name}: {str(e)}")
107
- processed_files += 1
108
- pbar.update(1)
109
- finally:
110
- if os.path.exists(temp_path):
111
- os.unlink(temp_path)
112
- uploaded_file_details[pdf_name]["processed"] = True
113
-
114
- progress = f"{processed_files}/{total_files}"
115
- print(f"Processing completed - Results: {results}, Progress: {progress}")
116
- return "\n".join(results), ai_result, failed_records, progress
117
-
118
- def retry_failed_record(index, object_name, manual_mappings):
119
- """Retry a failed record with manual corrections."""
120
- global failed_records, uploaded_file_details
121
- if 0 <= index < len(failed_records):
122
- failed_record = failed_records.pop(index)
123
- pdf_name = failed_record['pdf_name']
124
- temp_path = uploaded_file_details.get(pdf_name, {}).get("temp_path")
125
- if temp_path and os.path.exists(temp_path):
126
- with open(temp_path, 'rb') as f:
127
- result, ai_result, updated_records, progress = process_contract([f.read()], object_name, manual_mappings)
128
- failed_records = updated_records
129
- return result, updated_records, progress
130
- return "❌ File not found for retry.", failed_records, "0/1"
131
- return "❌ Invalid record index.", failed_records, "0/1"
132
 
133
  # Gradio UI
134
- with gr.Blocks(title="Smart Contract Migrator (Local Mode)") as demo:
135
  with gr.Row():
136
- uploaded_files = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contract PDFs")
137
- upload_progress = gr.Textbox(label="Upload Progress", value="0/0", interactive=False)
138
-
139
- object_name = gr.Dropdown(choices=["Contract", "Invoice", "Agreement"], label="Select Object Type (Mock)")
140
-
141
- def update_fields(selected_object):
142
- if selected_object:
143
- mock_fields = ["Name", "Description", "Amount", "Date"]
144
- return gr.update(visible=True, value="\n".join(mock_fields))
145
- return gr.update(visible=False)
146
-
147
- object_fields_output = gr.Textbox(label="Available Fields (Mock)", interactive=False)
148
- object_name.change(fn=update_fields, inputs=object_name, outputs=object_fields_output)
149
-
150
- manual_mapping_inputs = gr.State(value={})
151
- def update_manual_mappings(selected_object):
152
- if selected_object:
153
- mock_fields = ["Name", "Description", "Amount", "Date"]
154
- mapping_inputs = {field: gr.Textbox(label=f"{field} (Manual Correction)", interactive=True, value="") for field in mock_fields}
155
- return mapping_inputs
156
- return {}
157
-
158
- object_name.change(
159
- fn=update_manual_mappings,
160
- inputs=object_name,
161
- outputs=manual_mapping_inputs
162
- )
163
 
164
- process_button = gr.Button("Extract, Map, and Process")
 
 
165
  status_output = gr.Textbox(label="Status", interactive=False)
166
- ai_result_output = gr.JSON(label="AI Mapping Results (High-Confidence Mappings)")
 
167
 
168
- def process_and_display(files, obj_name, *mapping_values):
169
- field_names = list(manual_mapping_inputs.value.keys())
170
- manual_mappings_dict = {field: value for field, value in zip(field_names, mapping_values) if value}
171
- status, ai_result, updated_records, progress = process_contract(files, obj_name, manual_mappings_dict)
172
- global failed_records
173
- failed_records = updated_records
174
- return status, ai_result if ai_result else {}, gr.update(value=progress)
175
 
176
  process_button.click(
177
  fn=process_and_display,
178
- inputs=[uploaded_files, object_name] + [comp for comp in manual_mapping_inputs.value.values()],
179
- outputs=[status_output, ai_result_output, upload_progress]
180
  )
181
 
182
- with gr.Tab("Reconciliation & Retry"):
183
- failed_records_output = gr.Textbox(label="Failed Records", interactive=False, value="No failed records.")
184
-
185
- def update_reconciliation():
186
- global failed_records
187
- if failed_records:
188
- return "\n".join([f"{i}: {rec['pdf_name']} - {rec['error']}" for i, rec in enumerate(failed_records)])
189
- return "No failed records."
190
-
191
- def retry_and_update(index, obj_name, *mapping_values):
192
- field_names = list(manual_mapping_inputs.value.keys())
193
- manual_mappings_dict = {field: value for field, value in zip(field_names, mapping_values) if value}
194
- result, updated_records, progress = retry_failed_record(int(index), obj_name, manual_mappings_dict)
195
- global failed_records
196
- failed_records = updated_records
197
- return result, update_reconciliation(), gr.update(value=progress)
198
-
199
- retry_index = gr.Number(label="Select Failed Record Index", interactive=False)
200
- retry_manual_inputs = gr.State(value={field: gr.Textbox(label=f"{field} (Retry)", interactive=True, value="") for field in manual_mapping_inputs.value.keys()})
201
- retry_button = gr.Button("Retry")
202
- retry_status = gr.Textbox(label="Retry Status", interactive=False)
203
-
204
- retry_button.click(
205
- fn=retry_and_update,
206
- inputs=[retry_index, object_name] + [comp for comp in retry_manual_inputs.value.values()],
207
- outputs=[retry_status, failed_records_output, upload_progress]
208
  )
209
 
210
  demo.launch()
 
1
import gradio as gr
from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
import torch
from PIL import Image
import pdf2image
import pytesseract
import os
import tempfile
from tqdm import tqdm
import subprocess
import re

# Initialize global state
contract_data = {}  # In-memory repository: contract_id -> {"data", "risks", "clm_fields", "status"}
failed_records = []  # NOTE(review): never appended to in this version — confirm it is still needed
processed_files = 0  # files handled by the most recent process_contract() call
total_files = 0  # files expected by the most recent process_contract() call

# Load pre-trained LayoutLMv3 model and tokenizer
# NOTE(review): tokenizer/model (and torch, Image, tqdm) are loaded/imported at
# module import time but are not referenced anywhere else in this file —
# confirm whether this eager (and expensive) download is still required.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 
22
 
23
def check_poppler():
    """Return True if poppler-utils (``pdftoppm``) is installed and on PATH.

    Probes by running ``pdftoppm -v``; any OS-level launch failure
    (missing binary, permission error, ...) is treated as "not installed".

    Returns:
        bool: True when the probe process could be launched, else False.
    """
    try:
        # DEVNULL: we never read the version banner, so don't buffer it.
        subprocess.run(['pdftoppm', '-v'],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    except OSError:  # broader than FileNotFoundError: also PermissionError etc.
        return False
30
 
31
def extract_text_from_pdf(pdf_bytes):
    """Convert PDF bytes to page images and OCR them into one text blob.

    Failure protocol: returns a string starting with "Error" instead of
    raising — callers (process_contract) check ``text.startswith("Error")``.

    Args:
        pdf_bytes: raw bytes of an uploaded PDF file.

    Returns:
        str: OCR text (one newline after each page), or an "Error..." string.
    """
    if not check_poppler():
        return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
    # pdf2image needs a real file path, so spill the bytes to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        temp_path = tmp.name
    try:
        images = pdf2image.convert_from_path(temp_path)
        # join instead of repeated += — avoids quadratic string rebuilds.
        return "".join(pytesseract.image_to_string(img) + "\n" for img in images)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
    finally:
        # Temp file is created with delete=False, so we must remove it ourselves.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
49
+
50
def extract_key_data(text):
    """Pull dates, dollar amounts, and numbered clause bodies out of *text*.

    A lightweight regex stand-in for a real AI extraction step.

    Args:
        text: OCR'd contract text.

    Returns:
        dict: {"dates": [...], "amounts": [...], "clauses": [...]}.
    """
    date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
    amount_pattern = r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    clause_pattern = r'(?:Section|Clause)\s+\d+\.\d+\s+(.+?)(?=\n|$)'
    return {
        "dates": re.findall(date_pattern, text),
        "amounts": re.findall(amount_pattern, text),
        "clauses": re.findall(clause_pattern, text, re.DOTALL),
    }
56
+
57
def detect_risks(data, large_amount_threshold=1000000):
    """Flag basic contract risks in extracted key data.

    Args:
        data: dict with "dates" and "amounts" lists, as produced by
            extract_key_data().
        large_amount_threshold: dollar value above which an amount is
            flagged as a financial risk (default 1,000,000 — previously
            hard-coded).

    Returns:
        list[str]: human-readable risk messages; empty when none found.
    """
    risks = []
    # .get guards against a missing key; an empty list still counts as "no date".
    if not data.get("dates"):
        risks.append("No expiration date detected - potential obligation risk.")
    # Amounts look like "$1,234.56"; strip the formatting before comparing.
    values = (float(a.replace('$', '').replace(',', '')) for a in data.get("amounts", []))
    if any(v > large_amount_threshold for v in values):
        risks.append("Large amount detected - review for financial risk.")
    return risks
65
+
66
def process_contract(pdf_bytes, object_type):
    """Run the full mock CCI workflow on one uploaded PDF.

    Pipeline: OCR (extract_text_from_pdf) -> key-data regex extraction ->
    risk detection -> store result in the in-memory ``contract_data`` repo.

    Args:
        pdf_bytes: raw bytes of the uploaded PDF.
        object_type: user-selected object label ("Contract", "Agreement", ...).

    Returns:
        tuple: (status message, key-data dict, risk list, "done/total" string).
        On OCR failure the first element is the "Error..." string and the
        rest are empty placeholders.
    """
    global processed_files, total_files
    # This version processes exactly one file per call.
    total_files = 1
    processed_files = 0

    print("Received file - Starting processing")  # was an f-string with no placeholders
    text = extract_text_from_pdf(pdf_bytes)
    # extract_text_from_pdf signals failure by returning an "Error..." string.
    if isinstance(text, str) and text.startswith("Error"):
        return text, {}, [], "0/1"

    key_data = extract_key_data(text)
    risks = detect_risks(key_data)
    status = "✅ Processed" if not risks else "⚠️ Processed with risks"

    # Mock CLM integration with predefined fields
    clm_fields = {"Name": "Contract_001", "Type": object_type, "Status": status}
    clm_fields.update(key_data)

    # Sequential mock IDs; fine for a single-process demo repository.
    contract_id = f"Contract_{len(contract_data) + 1}"
    contract_data[contract_id] = {
        "data": key_data,
        "risks": risks,
        "clm_fields": clm_fields,
        "status": status
    }
    processed_files = 1
    progress = "1/1"
    print(f"Processing completed - ID: {contract_id}, Progress: {progress}")

    return status, key_data, risks, progress
97
+
98
def search_contracts(query):
    """Case-insensitive substring search over the in-memory repository.

    Args:
        query: keyword to look for in each stored contract record.

    Returns:
        dict: matching {contract_id: record} entries, or a one-entry
        "No matches" dict when nothing matches.
    """
    needle = query.lower()
    matches = {}
    for contract_id, record in contract_data.items():
        # Crude but effective: match against the record's full repr.
        if needle in str(record).lower():
            matches[contract_id] = record
    if matches:
        return matches
    return {"No matches": "No contracts found matching the query."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
# Gradio UI — wires the processing pipeline and the repository search
# into a two-part Blocks layout, then launches the app.
with gr.Blocks(title="Contract Intelligence App") as demo:
    with gr.Row():
        file_input = gr.File(type="binary", file_types=["pdf"], file_count="single", label="Upload Contract PDF")
        upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)

    object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")

    process_button = gr.Button("Process Contract")
    status_output = gr.Textbox(label="Status", interactive=False)
    extracted_data_output = gr.JSON(label="Extracted Data")
    risks_output = gr.Textbox(label="Detected Risks", interactive=False)

    def process_and_display(file, obj_type):
        """Drive process_contract() and shape its outputs for the widgets."""
        # Guard clause: nothing uploaded yet.
        if not file:
            return "❌ No file uploaded.", {}, "No risks detected", gr.update(value="0/0")
        status, data, risks, progress = process_contract(file, obj_type)
        risk_text = "\n".join(risks) if risks else "No risks detected"
        return status, data, risk_text, gr.update(value=progress)

    process_button.click(
        fn=process_and_display,
        inputs=[file_input, object_type],
        outputs=[status_output, extracted_data_output, risks_output, upload_progress]
    )

    with gr.Tab("Contract Repository"):
        search_query = gr.Textbox(label="Search Contracts", placeholder="Enter keyword...")
        search_results = gr.JSON(label="Search Results")
        search_button = gr.Button("Search")

        # search_contracts already has the (query) -> dict signature the
        # click handler needs, so it is wired up directly.
        search_button.click(
            fn=search_contracts,
            inputs=search_query,
            outputs=search_results
        )

demo.launch()