pavansuresh committed on
Commit
428bcb4
·
verified ·
1 Parent(s): dbc2871

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -78
app.py CHANGED
@@ -1,81 +1,144 @@
1
- import pdfplumber
 
 
 
 
 
 
2
  import re
3
- from datetime import datetime
4
- import logging
5
-
6
- # Configure logging
7
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
8
-
9
# Patterns are compiled once at import time instead of once per page/line.
# Dates: MM/DD/YYYY or DD-MM-YYYY style (1-2 digits, separator, 1-2 digits, separator, 4 digits).
_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
# Amounts: "$1,234,567.89" as well as ungrouped "$1234567.89". The grouped
# alternative must come first so that "$1,500" is not cut short at "$1".
_AMOUNT_RE = re.compile(r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?')
# Clause lines: start with "1.", "-" or a bullet character.
_CLAUSE_RE = re.compile(r'^\s*(?:\d+\.|-|\•)\s*')


def _collect_page_fields(text, data):
    """Update ``data`` in place with whatever fields one page's text provides."""
    lines = text.split("\n")

    # Agreement name: first of the top five lines that mentions "Agreement".
    if not data["Agreement Name"]:
        for line in lines[:5]:
            if "Agreement" in line:
                data["Agreement Name"] = line.strip()
                break

    # Start/end dates: first date found on a line carrying the matching label.
    for line in lines:
        if "Start Date" in line and not data["Agreement Start Date"]:
            found = _DATE_RE.search(line)
            if found:
                data["Agreement Start Date"] = found.group(0)
        if "End Date" in line and not data["Agreement End Date"]:
            found = _DATE_RE.search(line)
            if found:
                data["Agreement End Date"] = found.group(0)

    # Amount: first dollar figure anywhere on the page.
    if not data["Amount"]:
        found = _AMOUNT_RE.search(text)
        if found:
            data["Amount"] = found.group(0)

    # Clauses: every non-empty numbered or bulleted line, accumulated across pages.
    for line in lines:
        if _CLAUSE_RE.match(line) and line.strip():
            data["Clauses"].append(line.strip())


def extract_data_from_pdf(pdf_path):
    """Extract agreement metadata from a PDF using simple text heuristics.

    Each page is read with ``pdfplumber``. Scalar fields keep the first value
    found across the document; clause lines are collected from every page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys ``"Agreement Name"``, ``"Agreement Start Date"``,
        ``"Agreement End Date"``, ``"Amount"`` (strings, empty when not found)
        and ``"Clauses"`` (list of strings). Extraction is best-effort: any
        error is logged and whatever was gathered up to that point is returned.
    """
    data = {
        "Agreement Name": "",
        "Agreement Start Date": "",
        "Agreement End Date": "",
        "Amount": "",
        "Clauses": []
    }

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    logging.warning("No text extracted from page.")
                    continue
                _collect_page_fields(text, data)
    except Exception as e:
        # Deliberate best-effort: report the failure and return partial data.
        logging.error(f"Error extracting data from {pdf_path}: {e}")

    return data
63
-
64
def main(pdf_path="MSA_test_1_Original_MSA_2025-07-15.00.pdf", output_path="extracted_data.txt"):
    """Extract data from one PDF, log every field and write the result to a text file.

    Args:
        pdf_path: PDF to process. Defaults to the sample file name this script
            was originally hard-coded to.
        output_path: Text file that receives one ``key: value`` line per field.
    """
    # Local import: the file-level import only brings in ``datetime``.
    from datetime import timedelta, timezone

    # The log line is labelled "IST", so take the time in IST (UTC+05:30)
    # explicitly rather than relying on the machine's local time zone.
    ist = timezone(timedelta(hours=5, minutes=30), "IST")
    current_time = datetime.now(ist).strftime("%I:%M %p IST on %A, %B %d, %Y")
    logging.info(f"Starting extraction at {current_time}")

    extracted_data = extract_data_from_pdf(pdf_path)

    # Build the report lines once; they are both logged and written to disk.
    report_lines = [
        f"{key}: {value if value else 'Not found'}"
        for key, value in extracted_data.items()
    ]

    for line in report_lines:
        logging.info(line)

    # Explicit UTF-8: clause lines may contain non-ASCII bullets ("•") that the
    # platform default encoding cannot always represent.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in report_lines)
79
-
80
- if __name__ == "__main__":
81
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
3
+ import torch
4
+ from PIL import Image
5
+ import os
6
+ import tempfile
7
+ from tqdm import tqdm
8
  import re
9
+ from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
10
+ from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
11
+ from salesforce_utils import get_token, create_or_update_record
12
+
13
+ # Initialize global state
14
+ contract_data = {} # In-memory contract repository
15
+ processed_files = 0
16
+ total_files = 0
17
+
18
+ # Load pre-trained LayoutLMv3 model and tokenizer (placeholder for future fine-tuning)
19
+ tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
20
+ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
21
+
22
def save_temp_file(pdf_bytes, suffix=".pdf"):
    """Save PDF bytes to a temporary file and return the path.

    The file is created with ``delete=False`` so it survives after this call;
    the caller is responsible for removing it.

    Args:
        pdf_bytes: Bytes-like content to write.
        suffix: File name suffix for the temporary file (default ``".pdf"``).

    Returns:
        Path of the newly created temporary file.

    Raises:
        TypeError: If ``pdf_bytes`` is not bytes-like. Any exception raised by
            the write is re-raised after the half-created file is removed.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        try:
            tmp.write(pdf_bytes)
        except Exception:
            # Do not leave an orphaned file behind when the write fails.
            # Close first so the unlink also works on platforms that refuse
            # to delete open files.
            tmp.close()
            os.unlink(tmp.name)
            raise
        return tmp.name
27
+
28
def _parse_amount(raw):
    """Return ``raw`` as a float, or ``None`` when no number can be read from it."""
    if raw is None:
        return None
    if isinstance(raw, (int, float)):
        return float(raw)
    cleaned = str(raw).replace('$', '').replace(',', '')
    try:
        return float(cleaned)
    except ValueError:
        # Tolerate surrounding text such as "USD 2,000,000" or "2000000 dollars".
        found = re.search(r'-?\d+(?:\.\d+)?', cleaned)
        return float(found.group(0)) if found else None


def detect_risks(data, amount_threshold=1000000):
    """Detect risks (e.g., missing dates, large amounts).

    Args:
        data: Mapping of extracted fields. Only the ``"Date"`` and ``"Amount"``
            keys are consulted.
        amount_threshold: Amounts strictly above this value are flagged.

    Returns:
        A list of human-readable risk messages; empty when nothing is flagged.
        An amount that cannot be interpreted as a number is not flagged (it no
        longer raises).
    """
    risks = []
    if not data.get("Date"):
        risks.append("No expiration date detected - potential obligation risk.")

    amount = _parse_amount(data.get("Amount"))
    if amount is not None and amount > amount_threshold:
        risks.append("Large amount detected - review for financial risk.")
    return risks
36
+
37
def process_contract(pdf_bytes, object_type):
    """Process contract and simulate CCI workflow.

    The bytes are written to a temporary PDF, run through OCR and key-value
    extraction, checked for risks, optionally synced to Salesforce and finally
    stored in the in-memory ``contract_data`` repository.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        object_type: Object type selected in the UI. Stored as ``Type__c`` and,
            with a ``__c`` suffix, used as the Salesforce object name.

    Returns:
        A ``(status, key_data, risks, progress)`` tuple. On failure ``key_data``
        is ``{}``, ``risks`` is ``[]`` and ``progress`` is ``"0/1"``; on success
        ``progress`` is ``"1/1"``.
    """
    global processed_files, total_files
    total_files = 1
    processed_files = 0

    print("Received file - Starting processing")
    temp_path = save_temp_file(pdf_bytes)
    print(f"Temporary file created at: {temp_path}")

    # A single try/finally guarantees the temporary file is removed on every
    # exit path, including exceptions raised by OCR or extraction.
    try:
        page_data = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
        print(f"OCR result pages: {len(page_data) if page_data else 0}")
        if not page_data or all("No text detected" in page["text"] for page in page_data):
            print("No text extracted from PDF.")
            return "❌ No text extracted from PDF.", {}, [], "0/1"

        print("Extracting key data")
        key_data = extract_key_values_with_layoutlm(page_data, temp_path)
        print(f"Key data extracted: {key_data}")
        if "status" in key_data and key_data["status"] == "failed":
            print(f"Extraction failed: {key_data.get('error', 'Unknown error')}")
            return f"❌ Extraction failed: {key_data.get('error', 'Unknown error')}", {}, [], "0/1"

        print("Detecting risks")
        risks = detect_risks(key_data)
        print(f"Detected risks: {risks}")
        status = "✅ Processed" if not risks else "⚠️ Processed with risks"

        # The same identifier is used for the record name and the repository key.
        contract_id = f"Contract_{len(contract_data) + 1}"

        # Mock CLM fields with Salesforce-ready structure
        clm_fields = {"Name": contract_id, "Type__c": object_type, "Status__c": status}
        clm_fields.update({k: v for k, v in key_data.items() if k not in ["status", "error", "key_values"]})

        # Optional Salesforce sync: best-effort, a failure here must not fail
        # the processing of the contract itself.
        try:
            token, instance_url = get_token()
            sf_response = create_or_update_record(f"{object_type}__c", clm_fields, token, instance_url)
            if "error" in sf_response:
                print(f"Salesforce sync failed: {sf_response['error']}")
            else:
                print(f"Salesforce sync successful: {sf_response}")
        except Exception as e:
            print(f"Salesforce sync error: {str(e)}")

        contract_data[contract_id] = {
            "data": key_data,
            "risks": risks,
            "clm_fields": clm_fields,
            "status": status
        }
        processed_files = 1
        progress = "1/1"
        print(f"Processing completed - ID: {contract_id}, Progress: {progress}")

        return status, key_data, risks, progress
    finally:
        try:
            os.unlink(temp_path)
        except OSError as cleanup_error:
            # Cleanup problems are reported but never mask the real outcome.
            print(f"Could not remove temporary file {temp_path}: {cleanup_error}")
94
+
95
def search_contracts(query):
    """Search contract repository.

    Performs a case-insensitive substring match of ``query`` against the string
    form of every stored contract record.

    Args:
        query: Text to look for. ``None`` is treated like an empty string,
            which matches every stored contract.

    Returns:
        A dict mapping contract IDs to their stored records, or a single
        ``"No matches"`` entry when nothing matches.
    """
    # Normalise once instead of lower-casing the query for every record.
    needle = (query or "").lower()
    results = {
        cid: record
        for cid, record in contract_data.items()
        if needle in str(record).lower()
    }
    return results if results else {"No matches": "No contracts found matching the query."}
99
+
100
# Gradio UI
with gr.Blocks(title="Contract Intelligence App") as demo:
    with gr.Row():
        # Gradio expects file extensions with a leading dot (or a category
        # name such as "image"), hence ".pdf" rather than "pdf".
        file_input = gr.File(type="binary", file_types=[".pdf"], file_count="multiple", label="Upload Contracts")
        upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)

    # A default selection prevents a None object type from being turned into
    # the Salesforce object name "None__c" downstream.
    object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], value="Contract", label="Select Object Type")

    process_button = gr.Button("Process Contracts")
    status_output = gr.Textbox(label="Status", interactive=False)
    extracted_data_output = gr.JSON(label="Extracted Data")
    risks_output = gr.Textbox(label="Detected Risks", interactive=False)

    def process_and_display(files, obj_type):
        """Process every uploaded file and aggregate the results for the UI.

        Args:
            files: Uploaded files as delivered by ``file_input`` (presumably a
                list of raw byte strings given ``type="binary"`` and
                ``file_count="multiple"``; verify against the Gradio version in use).
            obj_type: Value selected in the object-type dropdown.

        Returns:
            A tuple for ``status_output``, ``extracted_data_output``,
            ``risks_output`` and ``upload_progress``: one status line per file,
            a ``File_<i>`` -> extracted-data mapping, the detected risks
            prefixed with their file label, and a ``succeeded/total`` progress
            update.
        """
        if not files:
            return "❌ No files uploaded.", {}, "No risks detected", gr.update(value="0/0")
        if not obj_type:
            return "❌ No object type selected.", {}, "No risks detected", gr.update(value=f"0/{len(files)}")

        results = []
        all_data = {}
        all_risks = []
        completed = 0
        for i, file in enumerate(files):
            label = f"File_{i}"
            try:
                status, data, risks, file_progress = process_contract(file, obj_type)
            except Exception as exc:
                # One broken file must not abort the rest of the batch.
                status, data, risks, file_progress = f"❌ Processing error: {exc}", {}, [], "0/1"
            # process_contract reports "1/1" only when the file was fully processed.
            if file_progress == "1/1":
                completed += 1
            results.append(f"{status} - File: {label}")
            all_data[label] = data
            # Attribute each risk to its file so multi-file batches stay readable.
            all_risks.extend(f"{label}: {risk}" for risk in risks)

        progress = f"{completed}/{len(files)}"
        risks_text = "\n".join(all_risks) if all_risks else "No risks detected"
        return "\n".join(results), all_data, risks_text, gr.update(value=progress)

    process_button.click(
        fn=process_and_display,
        inputs=[file_input, object_type],
        outputs=[status_output, extracted_data_output, risks_output, upload_progress]
    )

    with gr.Tab("Contract Repository"):
        search_query = gr.Textbox(label="Search Contracts", placeholder="Enter keyword...")
        search_results = gr.JSON(label="Search Results")
        search_button = gr.Button("Search")

        search_button.click(
            fn=search_contracts,
            inputs=search_query,
            outputs=search_results
        )

# Launch only when executed as a script so importing this module (tests,
# tooling) does not start a web server.
if __name__ == "__main__":
    demo.launch()