pavansuresh committed on
Commit
04bf8c5
·
verified ·
1 Parent(s): c8b6167

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -22
app.py CHANGED
@@ -11,12 +11,11 @@ import subprocess
11
  import re
12
 
13
  # Initialize global state
14
- contract_data = {} # In-memory repository
15
- failed_records = []
16
  processed_files = 0
17
  total_files = 0
18
 
19
- # Load pre-trained LayoutLMv3 model and tokenizer
20
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
21
  model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
22
 
@@ -50,24 +49,22 @@ def extract_text_from_pdf(pdf_bytes):
50
  text = ""
51
  for img in images:
52
  text += pytesseract.image_to_string(img) + "\n"
53
- print(f"OCR completed - Extracted text length: {len(text)}")
54
  return text
55
  except Exception as e:
56
- print(f"OCR failed: {str(e)}")
57
  return f"Error extracting text: {str(e)}"
58
  finally:
59
  if os.path.exists(temp_path):
60
  os.unlink(temp_path)
61
 
62
  def extract_key_data(text):
63
- """Extract key data (dates, amounts, clauses) using simple regex as a mock AI."""
64
  dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)
65
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text)
66
  clauses = re.findall(r'(?:Section|Clause)\s+\d+\.\d+\s+(.+?)(?=\n|$)', text, re.DOTALL)
67
  return {"dates": dates, "amounts": amounts, "clauses": clauses}
68
 
69
  def detect_risks(data):
70
- """Basic risk detection: flag missing dates or large amounts."""
71
  risks = []
72
  if not data["dates"]:
73
  risks.append("No expiration date detected - potential obligation risk.")
@@ -81,19 +78,19 @@ def process_contract(pdf_bytes, object_type):
81
  total_files = 1
82
  processed_files = 0
83
 
84
- print(f"Received file - Starting processing")
85
  text = extract_text_from_pdf(pdf_bytes)
86
  if isinstance(text, str) and text.startswith("Error"):
87
  return text, {}, [], "0/1"
88
 
89
- print(f"Extracting key data")
90
  key_data = extract_key_data(text)
91
- print(f"Detecting risks")
92
  risks = detect_risks(key_data)
93
  status = "✅ Processed" if not risks else "⚠️ Processed with risks"
94
 
95
  # Mock CLM integration with predefined fields
96
- clm_fields = {"Name": "Contract_001", "Type": object_type, "Status": status}
97
  clm_fields.update(key_data)
98
 
99
  contract_id = f"Contract_{len(contract_data) + 1}"
@@ -117,21 +114,29 @@ def search_contracts(query):
117
  # Gradio UI
118
  with gr.Blocks(title="Contract Intelligence App") as demo:
119
  with gr.Row():
120
- file_input = gr.File(type="binary", file_types=["pdf"], file_count="single", label="Upload Contract PDF")
121
  upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)
122
 
123
  object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")
124
 
125
- process_button = gr.Button("Process Contract")
126
  status_output = gr.Textbox(label="Status", interactive=False)
127
  extracted_data_output = gr.JSON(label="Extracted Data")
128
  risks_output = gr.Textbox(label="Detected Risks", interactive=False)
129
 
130
- def process_and_display(file, obj_type):
131
- if file:
132
- status, data, risks, progress = process_contract(file, obj_type)
133
- return status, data, "\n".join(risks) if risks else "No risks detected", gr.update(value=progress)
134
- return "❌ No file uploaded.", {}, "No risks detected", gr.update(value="0/0")
 
 
 
 
 
 
 
 
135
 
136
  process_button.click(
137
  fn=process_and_display,
@@ -144,11 +149,8 @@ with gr.Blocks(title="Contract Intelligence App") as demo:
144
  search_results = gr.JSON(label="Search Results")
145
  search_button = gr.Button("Search")
146
 
147
- def search_and_display(query):
148
- return search_contracts(query)
149
-
150
  search_button.click(
151
- fn=search_and_display,
152
  inputs=search_query,
153
  outputs=search_results
154
  )
 
11
  import re
12
 
13
  # Initialize global state
14
+ contract_data = {} # In-memory contract repository
 
15
  processed_files = 0
16
  total_files = 0
17
 
18
+ # Load pre-trained LayoutLMv3 model and tokenizer (placeholder for future fine-tuning)
19
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
20
  model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
21
 
 
49
  text = ""
50
  for img in images:
51
  text += pytesseract.image_to_string(img) + "\n"
 
52
  return text
53
  except Exception as e:
 
54
  return f"Error extracting text: {str(e)}"
55
  finally:
56
  if os.path.exists(temp_path):
57
  os.unlink(temp_path)
58
 
59
  def extract_key_data(text):
60
+ """Extract key data (dates, amounts, clauses) using regex as a mock AI."""
61
  dates = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', text)
62
  amounts = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', text)
63
  clauses = re.findall(r'(?:Section|Clause)\s+\d+\.\d+\s+(.+?)(?=\n|$)', text, re.DOTALL)
64
  return {"dates": dates, "amounts": amounts, "clauses": clauses}
65
 
66
  def detect_risks(data):
67
+ """Detect risks (e.g., missing dates, large amounts)."""
68
  risks = []
69
  if not data["dates"]:
70
  risks.append("No expiration date detected - potential obligation risk.")
 
78
  total_files = 1
79
  processed_files = 0
80
 
81
+ print("Received file - Starting processing")
82
  text = extract_text_from_pdf(pdf_bytes)
83
  if isinstance(text, str) and text.startswith("Error"):
84
  return text, {}, [], "0/1"
85
 
86
+ print("Extracting key data")
87
  key_data = extract_key_data(text)
88
+ print("Detecting risks")
89
  risks = detect_risks(key_data)
90
  status = "✅ Processed" if not risks else "⚠️ Processed with risks"
91
 
92
  # Mock CLM integration with predefined fields
93
+ clm_fields = {"Name": f"Contract_{len(contract_data) + 1}", "Type": object_type, "Status": status}
94
  clm_fields.update(key_data)
95
 
96
  contract_id = f"Contract_{len(contract_data) + 1}"
 
114
  # Gradio UI
115
  with gr.Blocks(title="Contract Intelligence App") as demo:
116
  with gr.Row():
117
+ file_input = gr.File(type="binary", file_types=["pdf"], file_count="multiple", label="Upload Contracts")
118
  upload_progress = gr.Textbox(label="Progress", value="0/0", interactive=False)
119
 
120
  object_type = gr.Dropdown(choices=["Contract", "Agreement", "Invoice"], label="Select Object Type")
121
 
122
+ process_button = gr.Button("Process Contracts")
123
  status_output = gr.Textbox(label="Status", interactive=False)
124
  extracted_data_output = gr.JSON(label="Extracted Data")
125
  risks_output = gr.Textbox(label="Detected Risks", interactive=False)
126
 
127
+ def process_and_display(files, obj_type):
128
+ if not files:
129
+ return "❌ No files uploaded.", {}, "No risks detected", gr.update(value="0/0")
130
+ results = []
131
+ all_data = {}
132
+ all_risks = []
133
+ for file in files:
134
+ status, data, risks, _ = process_contract(file, obj_type)
135
+ results.append(f"{status} - File: {os.path.basename(file.decode() if isinstance(file, bytes) else file)}")
136
+ all_data.update({f"File_{len(all_data)}": data})
137
+ all_risks.extend(risks)
138
+ progress = f"{len(files)}/{len(files)}"
139
+ return "\n".join(results), all_data, "\n".join(all_risks) if all_risks else "No risks detected", gr.update(value=progress)
140
 
141
  process_button.click(
142
  fn=process_and_display,
 
149
  search_results = gr.JSON(label="Search Results")
150
  search_button = gr.Button("Search")
151
 
 
 
 
152
  search_button.click(
153
+ fn=search_contracts,
154
  inputs=search_query,
155
  outputs=search_results
156
  )