pavansuresh committed on
Commit
f214078
·
verified ·
1 Parent(s): 9101271

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +125 -159
app.py CHANGED
@@ -1,178 +1,144 @@
1
  import gradio as gr
2
- from salesforce_utils import get_salesforce_objects, get_salesforce_object_fields, get_token, create_record, attach_pdf
3
- from ai_mapping import run_ai_mapping
4
- from ocr_utils import extract_text_from_pdf
5
  import os
6
  import tempfile
7
- import json
8
-
9
- # Initialize global state for failed records
10
- failed_records = []
11
-
12
- def save_failed_record(pdf_name, object_name, error, mappings):
13
- """Log failed records for reconciliation."""
14
- global failed_records
15
- failed_records.append({
16
- "pdf_name": pdf_name,
17
- "object_name": object_name,
18
- "error": error,
19
- "mappings": mappings
20
- })
21
-
22
- def process_contract(uploaded_files, object_name, manual_mappings):
23
- """Process uploaded PDFs and create Salesforce records."""
24
- if not uploaded_files:
25
- return "❌ No files uploaded.", None, failed_records
26
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  try:
28
  token, instance_url = get_token()
 
 
 
 
 
29
  except Exception as e:
30
- return f"Salesforce authentication failed: {str(e)}", None, failed_records
31
-
32
- try:
33
- object_fields = get_salesforce_object_fields(token, instance_url, object_name)
34
- object_field_names = [field['name'] for field in object_fields if field.get('createable')]
35
- except Exception as e:
36
- return f"❌ Failed to fetch object fields: {str(e)}", None, failed_records
37
-
38
- results = []
39
- for pdf_file in uploaded_files:
40
- pdf_name = pdf_file.name
41
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
42
- tmp.write(pdf_file.getvalue())
43
- tmp_path = tmp.name
44
-
45
- try:
46
- # Step 1: OCR
47
- text_data = extract_text_from_pdf(tmp_path)
48
- if not text_data:
49
- save_failed_record(pdf_name, object_name, "No text extracted from PDF", {})
50
- results.append(f"⚠️ {pdf_name}: No text extracted")
51
- continue
52
-
53
- # Step 2: AI Mapping
54
- ai_result = run_ai_mapping(text_data, tmp_path, object_field_names)
55
- if ai_result['status'] == 'failed':
56
- save_failed_record(pdf_name, object_name, ai_result['error'], ai_result['mappings'])
57
- results.append(f"❌ {pdf_name}: {ai_result['error']}")
58
- continue
59
-
60
- # Apply manual mappings (if provided)
61
- mappings = {k: v for k, v in ai_result['mappings'].items()}
62
- for field, value in manual_mappings.items():
63
- if value and field in object_field_names:
64
- mappings[field] = value
65
-
66
- # Step 3: Create Salesforce record
67
- record_response = create_record(object_name, mappings, token, instance_url)
68
- if 'id' in record_response:
69
- attach_pdf(record_response['id'], tmp_path, token, instance_url)
70
- results.append(f"✅ {pdf_name}: Record created (ID: {record_response['id']})")
71
- else:
72
- save_failed_record(pdf_name, object_name, f"Failed to create record: {record_response}", mappings)
73
- results.append(f"❌ {pdf_name}: Failed to create record: {record_response}")
74
- except Exception as e:
75
- save_failed_record(pdf_name, object_name, str(e), {})
76
- results.append(f"❌ {pdf_name}: {str(e)}")
77
- finally:
78
- os.unlink(tmp_path)
79
-
80
- return "\n".join(results), ai_result, failed_records
81
-
82
- def retry_failed_record(index, object_name, manual_mappings):
83
- """Retry a failed record with manual corrections."""
84
- global failed_records
85
- if 0 <= index < len(failed_records):
86
- failed_record = failed_records.pop(index)
87
- pdf_name = failed_record['pdf_name']
88
- with open(pdf_name, 'rb') as f: # Adjust path if needed
89
- result, ai_result, updated_records = process_contract([f], object_name, manual_mappings)
90
- failed_records = updated_records
91
- return result, updated_records
92
- return "❌ Invalid record index.", failed_records
93
 
94
  # Gradio UI
95
- with gr.Blocks(title="Smart Contract Migrator (Gradio)") as demo:
96
- # Epic 1: PDF Upload
97
  with gr.Row():
98
- uploaded_files = gr.File(file_types=["pdf"], file_count="multiple", label="Upload Contract PDFs")
99
-
100
- # Epic 2: Salesforce Object Selection
101
- token, instance_url = get_token()
102
- objects = get_salesforce_objects(token, instance_url)
103
- object_names = [obj['name'] for obj in objects if obj.get('createable')]
104
- object_name = gr.Dropdown(choices=object_names, label="Select Salesforce Object")
105
-
106
- # Display object fields and create dynamic manual mappings
107
- object_fields_state = gr.State(value=[])
108
-
109
- def update_fields_and_mappings(selected_object):
110
- if selected_object:
111
- try:
112
- token, instance_url = get_token()
113
- object_fields = get_salesforce_object_fields(token, instance_url, selected_object)
114
- field_names = [field['name'] for field in object_fields if field.get('createable')]
115
- # Create a list of textboxes dynamically
116
- mapping_inputs = [gr.Textbox(label=f"{field}", interactive=True) for field in field_names]
117
- return field_names, mapping_inputs, gr.update(visible=True, value="\n".join(field_names))
118
- except Exception as e:
119
- return [], [], gr.update(visible=False, value=f"❌ Failed to fetch fields: {str(e)}")
120
- return [], [], gr.update(visible=False)
121
-
122
- object_fields_output = gr.Textbox(label="Available Fields", interactive=False)
123
- manual_mapping_inputs = gr.State(value=[]) # Store the list of textbox components
124
-
125
- object_name.change(
126
- fn=update_fields_and_mappings,
127
- inputs=object_name,
128
- outputs=[object_fields_state, manual_mapping_inputs, object_fields_output]
129
- )
130
 
131
- # Process button
132
- process_button = gr.Button("Extract, Map, and Upload")
133
- status_output = gr.Textbox(label="Status", interactive=False)
134
- ai_result_output = gr.JSON(label="AI Mapping Results", visible=False)
135
 
136
- def process_and_display(files, obj_name, *mapping_values):
137
- field_names = object_fields_state.value
138
- manual_mappings_dict = {field: value for field, value in zip(field_names, mapping_values) if value}
139
- status, ai_result, updated_records = process_contract(files, obj_name, manual_mappings_dict)
140
- global failed_records
141
- failed_records = updated_records
142
- return status, ai_result if ai_result else {}, len(failed_records) > 0
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  process_button.click(
145
  fn=process_and_display,
146
- inputs=[uploaded_files, object_name] + [comp for comp in manual_mapping_inputs.value],
147
- outputs=[status_output, ai_result_output, gr.State(visible=True)]
148
  )
149
 
150
- # Epic 6: Reconciliation Dashboard
151
- with gr.Tab("Reconciliation Dashboard"):
152
- failed_records_output = gr.Textbox(label="Failed Records", interactive=False, value="No failed records.")
153
-
154
- def update_reconciliation():
155
- global failed_records
156
- if failed_records:
157
- return "\n".join([f"{i}: {rec['pdf_name']} - {rec['error']}" for i, rec in enumerate(failed_records)])
158
- return "No failed records."
159
-
160
- def retry_and_update(index, obj_name, *mapping_values):
161
- manual_mappings_dict = {field: value for field, value in zip(object_fields_state.value, mapping_values) if value}
162
- result, updated_records = retry_failed_record(int(index), obj_name, manual_mappings_dict)
163
- global failed_records
164
- failed_records = updated_records
165
- return result, update_reconciliation()
166
-
167
- retry_index = gr.Number(label="Select Failed Record Index", interactive=True)
168
- retry_manual_inputs = gr.State(value=[gr.Textbox(label=f"{field} (Retry)", interactive=True) for field in object_fields_state.value])
169
- retry_button = gr.Button("Retry")
170
- retry_status = gr.Textbox(label="Retry Status", interactive=False)
171
-
172
- retry_button.click(
173
- fn=retry_and_update,
174
- inputs=[retry_index, object_name] + [comp for comp in retry_manual_inputs.value],
175
- outputs=[retry_status, failed_records_output]
176
  )
177
 
178
  demo.launch()
 
1
  import gradio as gr
2
+ from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
3
+ import torch
4
+ from PIL import Image
5
  import os
6
  import tempfile
7
+ from tqdm import tqdm
8
+ import re
9
+ from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
10
+ from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
11
+ from salesforce_utils import get_token, create_or_update_record
12
+
13
+ # Initialize global state
14
+ contract_data = {} # In-memory contract repository
15
+ processed_files = 0
16
+ total_files = 0
17
+
18
+ # Load pre-trained LayoutLMv3 model and tokenizer (placeholder for future fine-tuning)
19
+ tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
20
+ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
21
+
22
def save_temp_file(pdf_bytes):
    """Write PDF bytes to a new temporary ``.pdf`` file and return its path.

    The file is created with ``delete=False``, so the caller owns it and is
    responsible for removing it once it is no longer needed.

    Args:
        pdf_bytes: Bytes-like content to write.

    Returns:
        Filesystem path of the temporary file.

    Raises:
        Any exception raised while writing (for example ``TypeError`` when
        ``pdf_bytes`` is not bytes-like). The partially written file is
        removed before the exception propagates.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        # The context manager closes (and flushes) the handle on exit.
        with tmp:
            tmp.write(pdf_bytes)
    except BaseException:
        # delete=False means nothing else would clean this file up, so do
        # not leave an orphan behind when the write fails.
        os.unlink(tmp.name)
        raise
    return tmp.name
27
+
28
def _parse_amount(raw):
    """Return ``raw`` as a float, or ``None`` if it is not a readable number."""
    if raw is None or isinstance(raw, bool):
        return None
    try:
        if isinstance(raw, (int, float)):
            return float(raw)
        # Drop the currency symbol and thousands separators before parsing.
        return float(str(raw).replace("$", "").replace(",", "").strip())
    except (ValueError, OverflowError):
        return None


def detect_risks(data, amount_threshold=1000000):
    """Detect simple risks in extracted contract data.

    Two checks are performed:

    * a missing or empty ``"Date"`` entry is reported as a missing
      expiration date;
    * an ``"Amount"`` entry strictly greater than ``amount_threshold`` is
      reported as a large amount.

    An amount that cannot be parsed as a number (for example ``"N/A"``) is
    ignored rather than raising, so one bad field does not abort processing.

    Args:
        data: Mapping of extracted field names to values.
        amount_threshold: Amounts strictly above this value are flagged.

    Returns:
        A list of human-readable risk messages; empty when nothing is flagged.
    """
    risks = []
    if not data.get("Date"):
        risks.append("No expiration date detected - potential obligation risk.")
    amount = _parse_amount(data.get("Amount"))
    if amount is not None and amount > amount_threshold:
        risks.append("Large amount detected - review for financial risk.")
    return risks
36
+
37
def process_contract(pdf_bytes, object_type):
    """Run one uploaded PDF through OCR, key-value extraction and risk checks.

    The PDF bytes are written to a temporary file and passed to the OCR and
    extraction helpers. The extracted data is checked by ``detect_risks``,
    a Salesforce sync is attempted (failures are only logged), and the result
    is stored in the in-memory ``contract_data`` repository. The temporary
    file is removed on every exit path, including when a helper raises.

    Args:
        pdf_bytes: Raw content of the uploaded PDF.
        object_type: Object type chosen in the UI; stored as ``Type__c`` and
            used (with a ``__c`` suffix) as the Salesforce object name.

    Returns:
        A ``(status, key_data, risks, progress)`` tuple. On OCR or extraction
        failure ``key_data`` is ``{}``, ``risks`` is ``[]`` and ``progress``
        is ``"0/1"``.
    """
    global processed_files, total_files
    total_files = 1
    processed_files = 0

    print("Received file - Starting processing")
    temp_path = save_temp_file(pdf_bytes)
    print(f"Temporary file created at: {temp_path}")
    try:
        page_data = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
        # Guard the length so an empty/None OCR result is reported, not raised.
        print(f"OCR result pages: {len(page_data) if page_data else 0}")
        if not page_data or all("No text detected" in page["text"] for page in page_data):
            print("No text extracted from PDF.")
            return "❌ No text extracted from PDF.", {}, [], "0/1"

        print("Extracting key data")
        key_data = extract_key_values_with_layoutlm(page_data, temp_path)
        print(f"Key data extracted: {key_data}")
        if key_data.get("status") == "failed":
            error = key_data.get('error', 'Unknown error')
            print(f"Extraction failed: {error}")
            return f"❌ Extraction failed: {error}", {}, [], "0/1"

        print("Detecting risks")
        risks = detect_risks(key_data)
        print(f"Detected risks: {risks}")
        status = "✅ Processed" if not risks else "⚠️ Processed with risks"

        # One id serves as both the record name and the repository key.
        contract_id = f"Contract_{len(contract_data) + 1}"

        # Mock CLM fields with Salesforce-ready structure
        clm_fields = {"Name": contract_id, "Type__c": object_type, "Status__c": status}
        clm_fields.update({k: v for k, v in key_data.items() if k not in ["status", "error", "key_values"]})

        # Optional Salesforce sync: best effort, never blocks local processing.
        # NOTE(review): object_type is None when nothing is selected in the
        # dropdown, which yields the object name "None__c" - confirm intent.
        try:
            token, instance_url = get_token()
            sf_response = create_or_update_record(f"{object_type}__c", clm_fields, token, instance_url)
            if "error" in sf_response:
                print(f"Salesforce sync failed: {sf_response['error']}")
            else:
                print(f"Salesforce sync successful: {sf_response}")
        except Exception as e:
            print(f"Salesforce sync error: {str(e)}")

        contract_data[contract_id] = {
            "data": key_data,
            "risks": risks,
            "clm_fields": clm_fields,
            "status": status
        }
        processed_files = 1
        progress = "1/1"
        print(f"Processing completed - ID: {contract_id}, Progress: {progress}")

        return status, key_data, risks, progress
    finally:
        # save_temp_file creates the file with delete=False, so it must be
        # removed here even when OCR or extraction raises.
        os.unlink(temp_path)
94
+
95
def search_contracts(query, repository=None):
    """Search a contract repository for entries mentioning ``query``.

    The match is a case-insensitive substring test against the string
    representation of each stored entry. An empty or ``None`` query matches
    every entry.

    Args:
        query: Keyword to look for; ``None`` is treated as an empty string.
        repository: Mapping of contract id to stored entry. Defaults to the
            module-level ``contract_data`` repository.

    Returns:
        A dict of the matching ``{contract_id: entry}`` pairs, or a single
        ``"No matches"`` entry when nothing matches.
    """
    if repository is None:
        repository = contract_data
    # Normalise once instead of lowercasing the query for every entry.
    needle = (query or "").lower()
    results = {
        cid: entry
        for cid, entry in repository.items()
        if needle in str(entry).lower()
    }
    return results if results else {"No matches": "No contracts found matching the query."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
  # Gradio UI
101
+ with gr.Blocks(title="Contract Intelligence App") as demo:
 
102
  with gr.Row():
103
+ file_input = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contracts")
104
+ upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
+ object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")
 
 
 
107
 
108
+ process_button = gr.Button("Process Contracts")
109
+ status_output = gr.Textbox(label="Status", interactive=False)
110
+ extracted_data_output = gr.JSON(label="Extracted Data")
111
+ risks_output = gr.Textbox(label="Detected Risks", interactive=False)
112
+
113
def process_and_display(files, obj_type):
    """Process every uploaded file and collect the results for the UI.

    Each file is passed to ``process_contract``. A file whose processing
    raises is reported as failed and the remaining files are still handled,
    so one bad upload does not discard the results of the others.

    Args:
        files: Uploaded file payloads from the file input; may be empty or None.
        obj_type: Object type selected in the dropdown.

    Returns:
        A 4-tuple matching the click handler's outputs: the status text (one
        line per file), a dict of extracted data keyed by ``File_<index>``,
        the detected risks as newline-joined text, and a ``gr.update``
        carrying the progress string.
    """
    if not files:
        return "❌ No files uploaded.", {}, "No risks detected", gr.update(value="0/0")

    results = []
    all_data = {}
    all_risks = []
    for i, file in enumerate(files):
        label = f"File_{i}"
        try:
            # The per-file progress value is not used; progress is batch-wide.
            status, data, risks, _ = process_contract(file, obj_type)
        except Exception as e:
            # UI boundary: report the failure and continue with the next file.
            status, data, risks = f"❌ Processing failed: {str(e)}", {}, []
        results.append(f"{status} - File: {label}")
        all_data[label] = data
        all_risks.extend(risks)

    progress = f"{len(files)}/{len(files)}"
    risks_text = "\n".join(all_risks) if all_risks else "No risks detected"
    return "\n".join(results), all_data, risks_text, gr.update(value=progress)
126
 
127
  process_button.click(
128
  fn=process_and_display,
129
+ inputs=[file_input, object_type],
130
+ outputs=[status_output, extracted_data_output, risks_output, upload_progress]
131
  )
132
 
133
+ with gr.Tab("Contract Repository"):
134
+ search_query = gr.Textbox(label="Search Contracts", placeholder="Enter keyword...")
135
+ search_results = gr.JSON(label="Search Results")
136
+ search_button = gr.Button("Search")
137
+
138
+ search_button.click(
139
+ fn=search_contracts,
140
+ inputs=search_query,
141
+ outputs=search_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  )
143
 
144
  demo.launch()