pavansuresh committed on
Commit
8f3b77b
·
verified ·
1 Parent(s): 9230bf8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -178
app.py CHANGED
@@ -1,210 +1,142 @@
1
  import gradio as gr
2
- from ai_mapping import run_ai_mapping_with_layoutlm, extract_key_values_with_layoutlm
3
- from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
 
 
 
4
  import os
5
  import tempfile
6
  from tqdm import tqdm
7
  import subprocess
 
8
 
9
- # Initialize global state for failed records and uploaded files
 
10
  failed_records = []
11
- uploaded_file_details = {}
 
12
 
13
- def is_pdf_file(file_bytes):
14
- """Check if the file is a valid PDF by reading the header from bytes."""
15
- valid_pdf_header = b'%PDF-'
16
- return file_bytes.startswith(valid_pdf_header) if file_bytes else False
17
 
18
  def check_poppler():
19
- """Check if poppler-utils is installed and in PATH."""
20
  try:
21
  subprocess.run(['pdftoppm', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
22
  return True
23
  except FileNotFoundError:
24
  return False
25
 
26
- def save_failed_record(pdf_name, object_name, error, mappings):
27
- """Log failed records for reconciliation."""
28
- global failed_records
29
- failed_records.append({
30
- "pdf_name": pdf_name,
31
- "object_name": object_name,
32
- "error": error,
33
- "mappings": mappings
34
- })
35
-
36
- def save_uploaded_file_details(pdf_name, temp_path):
37
- """Store file details securely."""
38
- global uploaded_file_details
39
- uploaded_file_details[pdf_name] = {"temp_path": temp_path, "processed": False}
40
-
41
- def process_contract(uploaded_files, object_name, manual_mappings):
42
- """Process uploaded PDFs locally with mock Salesforce object fields."""
43
- if not uploaded_files:
44
- return "❌ No files uploaded.", None, failed_records, "0/0"
45
-
46
- # Debug: Log uploaded files and their raw data
47
- print(f"Received files (bytes): {len(uploaded_files)} files at {len(uploaded_files)}")
48
- for i, file_bytes in enumerate(uploaded_files):
49
- print(f"File {i} header: {file_bytes[:5]} - Starting processing")
50
-
51
- # Check for poppler-utils
52
  if not check_poppler():
53
- return "Error: poppler-utils is not installed or not in PATH. Please install it (e.g., 'sudo apt-get install poppler-utils' on Linux).", None, failed_records, "0/0"
54
-
55
- # Mock Salesforce object fields
56
- mock_object_fields = ["Name", "Description", "Amount", "Date"] if object_name else []
57
- total_files = len(uploaded_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  processed_files = 0
59
- results = []
60
- ai_result = None
61
- with tqdm(total=total_files, desc="Processing PDFs") as pbar:
62
- for i, file_bytes in enumerate(uploaded_files):
63
- pdf_name = f"uploaded_file_{i}.pdf"
64
- if not is_pdf_file(file_bytes):
65
- save_failed_record(pdf_name, object_name, "Invalid PDF content", {})
66
- results.append(f"❌ {pdf_name}: Invalid PDF content")
67
- processed_files += 1
68
- pbar.update(1)
69
- continue
70
-
71
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
72
- tmp.write(file_bytes)
73
- temp_path = tmp.name
74
- save_uploaded_file_details(pdf_name, temp_path)
75
-
76
- try:
77
- print(f"Processing {pdf_name} - OCR stage")
78
- text_data = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
79
- if not text_data:
80
- save_failed_record(pdf_name, object_name, "No text extracted from PDF", {})
81
- results.append(f"⚠️ {pdf_name}: No text extracted")
82
- processed_files += 1
83
- pbar.update(1)
84
- continue
85
-
86
- print(f"Processing {pdf_name} - AI mapping stage")
87
- key_values = extract_key_values_with_layoutlm(text_data, temp_path)
88
- ai_result = run_ai_mapping_with_layoutlm(key_values, mock_object_fields, temp_path)
89
- if ai_result['status'] == 'failed':
90
- save_failed_record(pdf_name, object_name, ai_result['error'], ai_result['mappings'])
91
- results.append(f"❌ {pdf_name}: {ai_result['error']}")
92
- processed_files += 1
93
- pbar.update(1)
94
- continue
95
-
96
- mappings = {k: v for k, v in ai_result['mappings'].items()}
97
- for field, value in manual_mappings.items():
98
- if value and field in mock_object_fields:
99
- mappings[field] = value
100
-
101
- results.append(f"✅ {pdf_name}: Data processed locally (Mock ID: {hash(pdf_name)})")
102
- processed_files += 1
103
- pbar.update(1)
104
- except Exception as e:
105
- save_failed_record(pdf_name, object_name, str(e), {})
106
- results.append(f"❌ {pdf_name}: {str(e)}")
107
- processed_files += 1
108
- pbar.update(1)
109
- finally:
110
- if os.path.exists(temp_path):
111
- os.unlink(temp_path)
112
- uploaded_file_details[pdf_name]["processed"] = True
113
-
114
- progress = f"{processed_files}/{total_files}"
115
- print(f"Processing completed - Results: {results}, Progress: {progress}")
116
- return "\n".join(results), ai_result, failed_records, progress
117
-
118
- def retry_failed_record(index, object_name, manual_mappings):
119
- """Retry a failed record with manual corrections."""
120
- global failed_records, uploaded_file_details
121
- if 0 <= index < len(failed_records):
122
- failed_record = failed_records.pop(index)
123
- pdf_name = failed_record['pdf_name']
124
- temp_path = uploaded_file_details.get(pdf_name, {}).get("temp_path")
125
- if temp_path and os.path.exists(temp_path):
126
- with open(temp_path, 'rb') as f:
127
- result, ai_result, updated_records, progress = process_contract([f.read()], object_name, manual_mappings)
128
- failed_records = updated_records
129
- return result, updated_records, progress
130
- return "❌ File not found for retry.", failed_records, "0/1"
131
- return "❌ Invalid record index.", failed_records, "0/1"
132
 
133
  # Gradio UI
134
- with gr.Blocks(title="Smart Contract Migrator (Local Mode)") as demo:
135
  with gr.Row():
136
- uploaded_files = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contract PDFs")
137
- upload_progress = gr.Textbox(label="Upload Progress", value="0/0", interactive=False)
138
-
139
- object_name = gr.Dropdown(choices=["Contract", "Invoice", "Agreement"], label="Select Object Type (Mock)")
140
-
141
- def update_fields(selected_object):
142
- if selected_object:
143
- mock_fields = ["Name", "Description", "Amount", "Date"]
144
- return gr.update(visible=True, value="\n".join(mock_fields))
145
- return gr.update(visible=False)
146
-
147
- object_fields_output = gr.Textbox(label="Available Fields (Mock)", interactive=False)
148
- object_name.change(fn=update_fields, inputs=object_name, outputs=object_fields_output)
149
-
150
- manual_mapping_inputs = gr.State(value={})
151
- def update_manual_mappings(selected_object):
152
- if selected_object:
153
- mock_fields = ["Name", "Description", "Amount", "Date"]
154
- mapping_inputs = {field: gr.Textbox(label=f"{field} (Manual Correction)", interactive=True, value="") for field in mock_fields}
155
- return mapping_inputs
156
- return {}
157
-
158
- object_name.change(
159
- fn=update_manual_mappings,
160
- inputs=object_name,
161
- outputs=manual_mapping_inputs
162
- )
163
 
164
- process_button = gr.Button("Extract, Map, and Process")
 
 
165
  status_output = gr.Textbox(label="Status", interactive=False)
166
- ai_result_output = gr.JSON(label="AI Mapping Results (High-Confidence Mappings)")
 
167
 
168
- def process_and_display(files, obj_name, *mapping_values):
169
- field_names = list(manual_mapping_inputs.value.keys())
170
- manual_mappings_dict = {field: value for field, value in zip(field_names, mapping_values) if value}
171
- status, ai_result, updated_records, progress = process_contract(files, obj_name, manual_mappings_dict)
172
- global failed_records
173
- failed_records = updated_records
174
- return status, ai_result if ai_result else {}, gr.update(value=progress)
175
 
176
  process_button.click(
177
  fn=process_and_display,
178
- inputs=[uploaded_files, object_name] + [comp for comp in manual_mapping_inputs.value.values()],
179
- outputs=[status_output, ai_result_output, upload_progress]
180
  )
181
 
182
- with gr.Tab("Reconciliation & Retry"):
183
- failed_records_output = gr.Textbox(label="Failed Records", interactive=False, value="No failed records.")
184
-
185
- def update_reconciliation():
186
- global failed_records
187
- if failed_records:
188
- return "\n".join([f"{i}: {rec['pdf_name']} - {rec['error']}" for i, rec in enumerate(failed_records)])
189
- return "No failed records."
190
-
191
- def retry_and_update(index, obj_name, *mapping_values):
192
- field_names = list(manual_mapping_inputs.value.keys())
193
- manual_mappings_dict = {field: value for field, value in zip(field_names, mapping_values) if value}
194
- result, updated_records, progress = retry_failed_record(int(index), obj_name, manual_mappings_dict)
195
- global failed_records
196
- failed_records = updated_records
197
- return result, update_reconciliation(), gr.update(value=progress)
198
-
199
- retry_index = gr.Number(label="Select Failed Record Index", interactive=False)
200
- retry_manual_inputs = gr.State(value={field: gr.Textbox(label=f"{field} (Retry)", interactive=True, value="") for field in manual_mapping_inputs.value.keys()})
201
- retry_button = gr.Button("Retry")
202
- retry_status = gr.Textbox(label="Retry Status", interactive=False)
203
-
204
- retry_button.click(
205
- fn=retry_and_update,
206
- inputs=[retry_index, object_name] + [comp for comp in retry_manual_inputs.value.values()],
207
- outputs=[retry_status, failed_records_output, upload_progress]
208
  )
209
 
210
  demo.launch()
 
1
import gradio as gr
from transformers import LayoutLMv3Tokenizer, LayoutLMv3ForTokenClassification
import torch
from PIL import Image
import pdf2image
import pytesseract
import os
import tempfile
from tqdm import tqdm
import subprocess
import re

# Initialize global state
contract_data = {}  # In-memory repository: contract_id -> {"data", "risks", "clm_fields", "status"}
failed_records = []  # NOTE(review): never appended to in this version — confirm it is still needed
processed_files = 0  # files handled by the most recent process_contract() call
total_files = 0  # files expected by the most recent process_contract() call

# Load pre-trained LayoutLMv3 model and tokenizer
# NOTE(review): tokenizer/model (and torch, Image, tqdm) are loaded/imported at
# module import time but are not referenced anywhere else in this file —
# confirm whether this eager (and expensive) download is still required.
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 
22
 
23
def check_poppler():
    """Return True if poppler-utils (``pdftoppm``) is installed and on PATH.

    Probes by running ``pdftoppm -v``; any OS-level launch failure
    (missing binary, permission error, ...) is treated as "not installed".

    Returns:
        bool: True when the probe process could be launched, else False.
    """
    try:
        # DEVNULL: we never read the version banner, so don't buffer it.
        subprocess.run(['pdftoppm', '-v'],
                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return True
    except OSError:  # broader than FileNotFoundError: also PermissionError etc.
        return False
30
 
31
def extract_text_from_pdf(pdf_bytes):
    """Convert PDF bytes to page images and OCR them into one text blob.

    Failure protocol: returns a string starting with "Error" instead of
    raising — callers (process_contract) check ``text.startswith("Error")``.

    Args:
        pdf_bytes: raw bytes of an uploaded PDF file.

    Returns:
        str: OCR text (one newline after each page), or an "Error..." string.
    """
    if not check_poppler():
        return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
    # pdf2image needs a real file path, so spill the bytes to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(pdf_bytes)
        temp_path = tmp.name
    try:
        images = pdf2image.convert_from_path(temp_path)
        # join instead of repeated += — avoids quadratic string rebuilds.
        return "".join(pytesseract.image_to_string(img) + "\n" for img in images)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
    finally:
        # Temp file is created with delete=False, so we must remove it ourselves.
        if os.path.exists(temp_path):
            os.unlink(temp_path)
49
+
50
def extract_key_data(text):
    """Pull dates, dollar amounts, and numbered clause bodies out of *text*.

    A lightweight regex stand-in for a real AI extraction step.

    Args:
        text: OCR'd contract text.

    Returns:
        dict: {"dates": [...], "amounts": [...], "clauses": [...]}.
    """
    date_pattern = r'\d{1,2}/\d{1,2}/\d{4}'
    amount_pattern = r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?'
    clause_pattern = r'(?:Section|Clause)\s+\d+\.\d+\s+(.+?)(?=\n|$)'
    return {
        "dates": re.findall(date_pattern, text),
        "amounts": re.findall(amount_pattern, text),
        "clauses": re.findall(clause_pattern, text, re.DOTALL),
    }
56
+
57
def detect_risks(data, large_amount_threshold=1000000):
    """Flag basic contract risks in extracted key data.

    Args:
        data: dict with "dates" and "amounts" lists, as produced by
            extract_key_data().
        large_amount_threshold: dollar value above which an amount is
            flagged as a financial risk (default 1,000,000 — previously
            hard-coded).

    Returns:
        list[str]: human-readable risk messages; empty when none found.
    """
    risks = []
    # .get guards against a missing key; an empty list still counts as "no date".
    if not data.get("dates"):
        risks.append("No expiration date detected - potential obligation risk.")
    # Amounts look like "$1,234.56"; strip the formatting before comparing.
    values = (float(a.replace('$', '').replace(',', '')) for a in data.get("amounts", []))
    if any(v > large_amount_threshold for v in values):
        risks.append("Large amount detected - review for financial risk.")
    return risks
65
+
66
def process_contract(pdf_bytes, object_type):
    """Run the full mock CCI workflow on one uploaded PDF.

    Pipeline: OCR (extract_text_from_pdf) -> key-data regex extraction ->
    risk detection -> store result in the in-memory ``contract_data`` repo.

    Args:
        pdf_bytes: raw bytes of the uploaded PDF.
        object_type: user-selected object label ("Contract", "Agreement", ...).

    Returns:
        tuple: (status message, key-data dict, risk list, "done/total" string).
        On OCR failure the first element is the "Error..." string and the
        rest are empty placeholders.
    """
    global processed_files, total_files
    # This version processes exactly one file per call.
    total_files = 1
    processed_files = 0

    print("Received file - Starting processing")  # was an f-string with no placeholders
    text = extract_text_from_pdf(pdf_bytes)
    # extract_text_from_pdf signals failure by returning an "Error..." string.
    if isinstance(text, str) and text.startswith("Error"):
        return text, {}, [], "0/1"

    key_data = extract_key_data(text)
    risks = detect_risks(key_data)
    status = "✅ Processed" if not risks else "⚠️ Processed with risks"

    # Mock CLM integration with predefined fields
    clm_fields = {"Name": "Contract_001", "Type": object_type, "Status": status}
    clm_fields.update(key_data)

    # Sequential mock IDs; fine for a single-process demo repository.
    contract_id = f"Contract_{len(contract_data) + 1}"
    contract_data[contract_id] = {
        "data": key_data,
        "risks": risks,
        "clm_fields": clm_fields,
        "status": status
    }
    processed_files = 1
    progress = "1/1"
    print(f"Processing completed - ID: {contract_id}, Progress: {progress}")

    return status, key_data, risks, progress
97
+
98
def search_contracts(query):
    """Case-insensitive substring search over the in-memory repository.

    Args:
        query: keyword to look for in each stored contract record.

    Returns:
        dict: matching {contract_id: record} entries, or a one-entry
        "No matches" dict when nothing matches.
    """
    needle = query.lower()
    matches = {}
    for contract_id, record in contract_data.items():
        # Crude but effective: match against the record's full repr.
        if needle in str(record).lower():
            matches[contract_id] = record
    if matches:
        return matches
    return {"No matches": "No contracts found matching the query."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
# Gradio UI — wires the processing pipeline and the repository search
# into a two-part Blocks layout, then launches the app.
with gr.Blocks(title="Contract Intelligence App") as demo:
    with gr.Row():
        file_input = gr.File(type="binary", file_types=["pdf"], file_count="single", label="Upload Contract PDF")
        upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)

    object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")

    process_button = gr.Button("Process Contract")
    status_output = gr.Textbox(label="Status", interactive=False)
    extracted_data_output = gr.JSON(label="Extracted Data")
    risks_output = gr.Textbox(label="Detected Risks", interactive=False)

    def process_and_display(file, obj_type):
        """Drive process_contract() and shape its outputs for the widgets."""
        # Guard clause: nothing uploaded yet.
        if not file:
            return "❌ No file uploaded.", {}, "No risks detected", gr.update(value="0/0")
        status, data, risks, progress = process_contract(file, obj_type)
        risk_text = "\n".join(risks) if risks else "No risks detected"
        return status, data, risk_text, gr.update(value=progress)

    process_button.click(
        fn=process_and_display,
        inputs=[file_input, object_type],
        outputs=[status_output, extracted_data_output, risks_output, upload_progress]
    )

    with gr.Tab("Contract Repository"):
        search_query = gr.Textbox(label="Search Contracts", placeholder="Enter keyword...")
        search_results = gr.JSON(label="Search Results")
        search_button = gr.Button("Search")

        # search_contracts already has the (query) -> dict signature the
        # click handler needs, so it is wired up directly.
        search_button.click(
            fn=search_contracts,
            inputs=search_query,
            outputs=search_results
        )

demo.launch()