prudhviLatha commited on
Commit
04d8f7e
·
verified ·
1 Parent(s): 8c8e3d5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +514 -0
app.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import logging
4
+ from logging.handlers import RotatingFileHandler
5
+ import re
6
+ from datetime import datetime
7
+ from dotenv import load_dotenv
8
+ from cryptography.fernet import Fernet
9
+ from simple_salesforce import Salesforce
10
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
11
+ from sentence_transformers import SentenceTransformer, util
12
+ from PIL import Image
13
+ import pytesseract
14
+ import pandas as pd
15
+ from docx import Document
16
+ import PyPDF2
17
+ import gradio as gr
18
+ from pdf2image import convert_from_path
19
+ import tempfile
20
+ from pytz import timezone
21
+ import shutil
22
+
23
# Logging: rotate app.log in the temp dir (10 MB per file, 5 backups) and
# mirror everything to the console.
log_file = os.path.join(tempfile.gettempdir(), 'app.log')
handler = RotatingFileHandler(log_file, maxBytes=10 * 1024 * 1024, backupCount=5)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[handler, logging.StreamHandler()],
)
logger = logging.getLogger(__name__)
35
+
36
# Check dependencies at startup
def check_dependencies():
    """Verify required Python packages and optional OCR binaries.

    Raises ImportError when a required Python package is missing. The external
    tools (Tesseract, Poppler) are treated as optional: their absence is logged
    and recorded in the returned list so OCR code paths can degrade gracefully.

    Returns:
        list[str]: names of missing external tools ("Tesseract", "Poppler").
    """
    missing_deps = []
    try:
        import pytesseract, pandas, openpyxl, xlrd, docx, PyPDF2, pdf2image
        # Tesseract binary — required for any OCR (images and scanned PDFs).
        try:
            tess_cmd = shutil.which('tesseract')
            if tess_cmd:
                pytesseract.pytesseract.tesseract_cmd = tess_cmd
                version = pytesseract.get_tesseract_version()
                logger.info(f"Tesseract found at {tess_cmd}, version: {version}")
            else:
                logger.warning("Tesseract not found in PATH. Install with 'sudo apt install tesseract-ocr'. OCR-dependent files (JPEG, PNG, scanned PDFs) will not be processed.")
                missing_deps.append("Tesseract")
        except Exception as e:
            logger.warning(f"Tesseract unavailable: {str(e)}. Install with 'sudo apt install tesseract-ocr'. OCR-dependent files (JPEG, PNG, scanned PDFs) will not be processed.")
            missing_deps.append("Tesseract")
        # Poppler utilities — pdf2image needs pdfinfo/pdftoppm for scanned PDFs.
        try:
            pdfinfo_cmd = shutil.which('pdfinfo')
            if pdfinfo_cmd:
                logger.info(f"Poppler found at {pdfinfo_cmd}")
            else:
                logger.warning("Poppler not found in PATH. Install with 'sudo apt install poppler-utils'. Scanned PDFs will fail.")
                missing_deps.append("Poppler")
        except Exception as e:
            logger.warning(f"Poppler unavailable: {str(e)}. Install with 'sudo apt install poppler-utils'. Scanned PDFs will fail.")
            missing_deps.append("Poppler")
        logger.info("Required Python packages installed")
    except ImportError as e:
        logger.error(f"Missing Python dependency: {str(e)}. Install via pip.")
        raise ImportError(f"Missing Python dependency: {str(e)}")
    return missing_deps
70
+
71
missing_deps = check_dependencies()

# Load environment variables
load_dotenv()
required_env_vars = [
    'ENCRYPTION_KEY', 'SALESFORCE_USERNAME', 'SALESFORCE_PASSWORD',
    'SALESFORCE_SECURITY_TOKEN', 'SALESFORCE_DOMAIN'
]
env = {var: os.getenv(var) for var in required_env_vars}
# Fail fast on startup if any credential/config value is absent.
missing = [name for name in required_env_vars if not env[name]]
if missing:
    logger.error(f"Missing environment variables: {', '.join(missing)}")
    raise ValueError(f"Missing environment variables: {', '.join(missing)}")

# Setup encryption — Fernet key comes from ENCRYPTION_KEY (urlsafe base64).
try:
    fernet = Fernet(env['ENCRYPTION_KEY'].encode())
except Exception as e:
    logger.error(f"Invalid encryption key: {e}")
    raise ValueError(f"Invalid encryption key: {e}")
90
+
91
# Salesforce connection retry
def init_salesforce(max_retries=3, delay=3):
    """Open a Salesforce session, retrying transient failures.

    Args:
        max_retries: number of connection attempts before giving up.
        delay: seconds to sleep between attempts.

    Returns:
        An authenticated simple_salesforce.Salesforce client.

    Raises:
        ValueError: if every attempt fails.
    """
    last_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            connection = Salesforce(
                username=env['SALESFORCE_USERNAME'],
                password=env['SALESFORCE_PASSWORD'],
                security_token=env['SALESFORCE_SECURITY_TOKEN'],
                domain=env['SALESFORCE_DOMAIN'],
                version='58.0'
            )
        except Exception as e:
            logger.error(f"Salesforce connection attempt {attempt + 1} failed: {str(e)}")
            # No sleep after the final failed attempt.
            if attempt < last_attempt:
                time.sleep(delay)
        else:
            logger.info("Connected to Salesforce")
            return connection
    logger.error("Salesforce connection failed after retries")
    raise ValueError("Salesforce connection failed after retries")
110
+
111
# Initialize models
def init_models():
    """Load the NLP models used by the pipeline.

    Returns:
        tuple: (t5-base summarization pipeline, all-MiniLM-L6-v2 sentence encoder).

    Raises:
        Exception: re-raised from the underlying model loaders on failure.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained("t5-base")
        seq2seq = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
        summarizer = pipeline("summarization", model=seq2seq, tokenizer=tokenizer)
        sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
        logger.info("NLP models initialized successfully")
        return summarizer, sentence_model
    except Exception as e:
        logger.error(f"Model initialization failed: {str(e)}")
        raise
125
+
126
# Clean text for better processing
def clean_text(text):
    """Normalize raw extracted text for downstream NLP.

    Collapses runs of whitespace, replaces non-ASCII characters with spaces,
    and blanks standalone numbers — except a day number directly followed by
    ", <year>", which is kept so dates like "June 26, 2025" survive.

    Returns "" for falsy input or on any processing error.
    """
    try:
        if not text:
            return ""
        collapsed = re.sub(r'\s+', ' ', text.strip())
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', collapsed)
        return re.sub(r'\b\d+\b(?!\s*,\s*\d{4})', ' ', ascii_only)
    except Exception as e:
        logger.error(f"Text cleaning failed: {str(e)}")
        return ""
138
+
139
# Validate file readability
def validate_file(file_path):
    """Check that a path points to a non-empty file of a supported type.

    Returns:
        tuple: (True, None) when valid, otherwise (False, reason_string).
    """
    try:
        ext = os.path.splitext(file_path)[1].lower()
        supported = ('.pdf', '.docx', '.png', '.jpg', '.jpeg', '.csv', '.xls', '.xlsx')
        if ext not in supported:
            return False, f"Unsupported file type: {ext}"
        if not os.path.exists(file_path):
            return False, f"File not found: {file_path}"
        if os.path.getsize(file_path) == 0:
            return False, f"File is empty: {file_path}"
        return True, None
    except Exception as e:
        logger.error(f"File validation failed for {file_path}: {str(e)}")
        return False, f"File validation failed: {str(e)}"
153
+
154
# Extract text from file
def extract_text(file_path):
    """Extract plain text from a supported document.

    Handles PDFs (text layer first, OCR fallback for scans), DOCX (paragraphs
    plus table cells), images (OCR), and CSV/Excel (stringified DataFrame).
    The result is passed through clean_text() and rejected if shorter than
    50 characters.

    Returns:
        tuple: (text, None) on success, (None, error_message) on failure.
    """
    is_valid, error = validate_file(file_path)
    if not is_valid:
        logger.error(error)
        return None, error
    ext = os.path.splitext(file_path)[1].lower()
    try:
        logger.debug(f"Extracting text from {file_path} (type: {ext})")
        if ext == '.pdf':
            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                text = "".join(page.extract_text() or "" for page in reader.pages)
            # A near-empty text layer usually means a scanned PDF — fall back to OCR.
            if not text or len(text.strip()) < 50:
                logger.warning(f"PDF text extraction failed or too short, attempting OCR")
                if 'Tesseract' in missing_deps or 'Poppler' in missing_deps:
                    return None, "OCR unavailable: Tesseract or Poppler not installed. Install with 'sudo apt install tesseract-ocr poppler-utils'."
                try:
                    pages = convert_from_path(file_path)
                    ocr_chunks = []
                    for page_no, page_img in enumerate(pages, start=1):
                        logger.debug(f"Processing page {page_no} for OCR")
                        gray = page_img.convert('L')  # grayscale for OCR
                        gray = gray.resize((gray.width // 2, gray.height // 2))  # shrink for speed
                        ocr_chunks.append(pytesseract.image_to_string(gray, config='--psm 6') + "\n")
                    text = "".join(ocr_chunks)
                except Exception as ocr_err:
                    logger.error(f"OCR failed: {str(ocr_err)}")
                    return None, f"OCR failed for {file_path}: {str(ocr_err)}"
        elif ext == '.docx':
            doc = Document(file_path)
            text = "\n".join(p.text for p in doc.paragraphs if p.text)
            # Append every table cell's text as well.
            for cell_text in (cell.text for table in doc.tables
                              for row in table.rows for cell in row.cells):
                text += "\n" + cell_text
        elif ext in ('.png', '.jpg', '.jpeg'):
            if 'Tesseract' in missing_deps:
                return None, "OCR unavailable: Tesseract not installed. Install with 'sudo apt install tesseract-ocr'."
            try:
                img = Image.open(file_path).convert('L')
                img = img.resize((img.width // 2, img.height // 2))  # shrink for speed
                text = pytesseract.image_to_string(img, config='--psm 6')
            except Exception as ocr_err:
                logger.error(f"OCR failed for {file_path}: {str(ocr_err)}")
                return None, f"OCR failed for {file_path}: {str(ocr_err)}"
        elif ext in ('.csv', '.xls', '.xlsx'):
            try:
                if ext == '.csv':
                    df = pd.read_csv(file_path)
                else:
                    df = pd.read_excel(file_path)
                logger.debug(f"Excel/CSV columns: {df.columns.tolist()}")
                text = df.to_string(index=False)
            except Exception as e:
                logger.error(f"Excel/CSV processing failed for {file_path}: {str(e)}")
                return None, f"Excel/CSV processing failed: {str(e)}"
        text = clean_text(text)
        if not text or len(text) < 50:
            logger.error(f"Extracted text is empty or too short: {len(text)} characters")
            return None, f"Text extraction failed: No valid text extracted from {file_path}"
        logger.debug(f"Extracted text length: {len(text)} characters")
        return text, None
    except Exception as e:
        logger.error(f"Text extraction failed for {file_path}: {str(e)}")
        return None, f"Text extraction failed: {str(e)}"
216
+
217
# Parse dates with IST timezone
def parse_dates(text):
    """Extract up to two ISO-formatted (YYYY-MM-DD) dates from contract text.

    Scans for several common date notations, parses each candidate against a
    list of strptime formats, and localizes to Asia/Kolkata. Always returns a
    list of exactly two date strings, padded with the first match (or today's
    date when nothing parses).

    Fixes over the previous version:
      * pytz zones must be attached with ``localize()``; ``replace(tzinfo=...)``
        picks the zone's historical LMT offset and can shift the date.
      * each candidate is tried against both abbreviated (%b) and full (%B)
        month-name formats, with and without commas, matching what the regex
        patterns actually capture.
      * drops the leftover hard-coded 18:33 debug timestamp (only the date
        portion was ever used).
    """
    ist = timezone('Asia/Kolkata')
    current_date = datetime.now(ist)
    try:
        date_patterns = [
            r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:-|\s*,?\s*)\d{4}\b',
            r'\b\d{1,2}/\d{1,2}/\d{4}\b',
            r'\b\d{4}-\d{2}-\d{2}\b',
            r'\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s*\d{4}\b',
            r'\b\d{4}\b'
        ]
        # Tried in order for each candidate; first match wins. strptime month
        # matching is case-insensitive, mirroring the re.IGNORECASE scan above.
        candidate_formats = (
            '%m/%d/%Y', '%Y-%m-%d',
            '%B %d, %Y', '%b %d, %Y', '%B %d %Y', '%b %d %Y',
            '%B %d-%Y', '%b %d-%Y',
            '%B %Y', '%b %Y', '%Y',
        )
        dates = []
        unparseable_dates = []
        for pattern in date_patterns:
            found_dates = re.findall(pattern, text, re.IGNORECASE)
            dates.extend(found_dates)
            if found_dates:
                logger.debug(f"Found dates with pattern {pattern}: {found_dates}")
        parsed_dates = []
        for date in dates:
            normalized = re.sub(r'\s+', ' ', date.strip())
            parsed = None
            for fmt in candidate_formats:
                try:
                    parsed = datetime.strptime(normalized, fmt)
                    break
                except ValueError:
                    continue
            if parsed is None:
                logger.debug(f"Unparseable date '{date}'")
                unparseable_dates.append(date)
                continue
            # pytz requires localize() — see docstring.
            parsed_dates.append(ist.localize(parsed).strftime('%Y-%m-%d'))
        if unparseable_dates:
            logger.warning(f"Found {len(unparseable_dates)} unparseable dates: {unparseable_dates}")
        if not parsed_dates:
            logger.warning("No valid dates extracted, using current date")
            parsed_dates.append(current_date.strftime('%Y-%m-%d'))
        while len(parsed_dates) < 2:
            parsed_dates.append(parsed_dates[0])
        logger.debug(f"Extracted {len(parsed_dates)} valid dates: {parsed_dates}")
        return parsed_dates[:2]
    except Exception as e:
        logger.error(f"Date parsing failed: {str(e)}")
        return [current_date.strftime('%Y-%m-%d'), current_date.strftime('%Y-%m-%d')]
266
+
267
# Summarize contract
def summarize_contract(text, summarizer, sentence_model):
    """Build a full summary plus per-aspect extracts and key dates.

    Args:
        text: cleaned contract text.
        summarizer: transformers summarization pipeline (T5-style prompt).
        sentence_model: SentenceTransformer used for aspect matching.

    Returns:
        tuple: (summary_dict, None). summary_dict holds 'full_summary',
        'aspect_summaries' (one entry per aspect, 'Not extracted' when no
        sentence matched), and 'dates'. Failures never raise; they are
        reported inside the dict.
    """
    aspects = ["parties", "payment terms", "obligations", "termination clauses"]
    try:
        if not text or len(text.strip()) < 50:
            logger.error("Input text is empty or too short")
            return {
                "full_summary": "No summary generated due to insufficient text",
                "aspect_summaries": {asp: "Not extracted" for asp in aspects},
                "dates": parse_dates(text)
            }, None
        # Cap input length so the T5 prompt stays within reasonable bounds.
        text = clean_text(text)[:4096]
        try:
            candidate = summarizer(f"summarize: {text}", max_length=150, min_length=50, do_sample=False)[0]['summary_text']
            # Guard against the model echoing the input verbatim.
            if candidate.strip() == text.strip()[:len(candidate)]:
                logger.warning("Summary identical to input, generating fallback")
                candidate = f"Summary: {text[:150]}..." if len(text) > 150 else text
            logger.debug(f"Generated summary: {candidate[:50]}...")
            full_summary = candidate
        except Exception as e:
            logger.error(f"Summarizer failed: {str(e)}")
            full_summary = f"Summary failed: {text[:150]}..." if len(text) > 150 else text
        aspect_synonyms = {
            "parties": ["contractor", "client", "party", "signatory", "entity"],
            "payment terms": ["payment", "compensation", "fees", "billing", "invoicing"],
            "obligations": ["duties", "responsibilities", "obligations", "commitments"],
            "termination clauses": ["termination", "cancellation", "end of contract", "exit"]
        }
        aspect_summaries = {}
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip() and len(s.strip()) > 10]
        if sentences:
            logger.debug(f"Extracted {len(sentences)} sentences for aspect summarization")
            sentence_embs = sentence_model.encode(sentences, convert_to_tensor=True)
            for asp in aspects:
                # Best cosine similarity over the aspect name and its synonyms.
                queries = [asp] + aspect_synonyms.get(asp, [])
                query_embs = sentence_model.encode(queries, convert_to_tensor=True)
                sims = util.cos_sim(query_embs, sentence_embs).max(dim=0).values
                top = sims.argsort(descending=True)[:5]
                asp_text = ". ".join(sentences[i] for i in top if sims[i] > 0.05)
                if asp_text:
                    aspect_summaries[asp] = asp_text[:200]
                    logger.debug(f"Aspect '{asp}' matched {len([i for i in top if sims[i] > 0.05])} sentences")
                else:
                    logger.warning(f"No sentences matched aspect '{asp}'")
                    aspect_summaries[asp] = "Not extracted"
        else:
            logger.warning("No valid sentences for aspect summarization")
            for asp in aspects:
                aspect_summaries[asp] = "Not extracted"
        return {
            "full_summary": full_summary,
            "aspect_summaries": aspect_summaries,
            "dates": parse_dates(text)
        }, None
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}")
        return {
            "full_summary": f"Summary generation failed: {text[:150]}..." if len(text) > 150 else text,
            "aspect_summaries": {asp: "Not extracted" for asp in aspects},
            "dates": parse_dates(text)
        }, None
329
+
330
# Create Contract Document record
def create_contract_document(sf, file_name, file_url=None):
    """Create (or reuse) today's Contract_Document__c record for a file.

    Deduplicates on "same file name, same day" using the SOQL ``TODAY`` date
    literal, which works for both Date and DateTime fields. The stored
    Upload_Date__c is the actual current time converted to UTC, since the
    ``Z``-suffixed dateTime format declares a UTC value.

    Fixes over the previous version:
      * no more hard-coded 18:33 debug timestamp on every record;
      * IST wall-clock time is no longer mislabeled as UTC (``Z``);
      * dedupe no longer depends on an exact dateTime equality match that
        only worked because the timestamp was hard-coded.

    Returns:
        tuple: (record_id, None) on success, (None, error_message) on failure.
    """
    try:
        # SOQL string literals escape quotes with a backslash.
        escaped_file_name = file_name.replace("'", "\\'")
        upload_datetime = datetime.now(timezone('UTC')).strftime('%Y-%m-%dT%H:%M:%SZ')
        query = (
            f"SELECT Id, Upload_Date__c FROM Contract_Document__c "
            f"WHERE Name = '{escaped_file_name}' AND Upload_Date__c = TODAY LIMIT 1"
        )
        logger.debug(f"Executing SOQL query: {query}")
        try:
            result = sf.query(query)
            if result['totalSize'] > 0:
                doc_id = result['records'][0]['Id']
                logger.info(f"Contract Document exists for {file_name} today, ID {doc_id}")
                return doc_id, None
        except Exception as e:
            # Best-effort dedupe: a failed lookup should not block the upload.
            logger.warning(f"Duplicate-check query failed: {str(e)}. Creating a new record.")
        record = {
            'Name': file_name,
            'Document_URL__c': file_url or '',
            'Upload_Date__c': upload_datetime,
            'Status__c': 'Uploaded'
        }
        result = sf.Contract_Document__c.create(record)
        logger.info(f"Created Contract Document for {file_name} with ID {result['id']}")
        return result['id'], None
    except Exception as e:
        logger.error(f"Failed to create Contract Document for {file_name}: {str(e)}")
        return None, f"Failed to create Contract Document: {str(e)}"
367
+
368
# Store summary in Salesforce
def store_in_salesforce(sf, summary_data, file_name, contract_document_id):
    """Persist at most one Contract_Summary__c per Contract Document.

    The full summary is Fernet-encrypted before storage; aspect texts are
    truncated to the long-text field limit. If a summary already exists for
    the document, its Id is returned instead of creating a duplicate.

    Returns:
        tuple: ({'id': ...} or create-result, None) on success,
        (None, error_message) on failure.
    """
    try:
        query = f"SELECT Id FROM Contract_Summary__c WHERE Contract_Document__c = '{contract_document_id}' LIMIT 1"
        logger.debug(f"Executing SOQL query: {query}")
        existing = sf.query(query)
        if existing['totalSize'] > 0:
            summary_id = existing['records'][0]['Id']
            logger.info(f"Summary exists for Contract Document ID {contract_document_id}, ID {summary_id}")
            return {'id': summary_id}, None
        encrypted_summary = fernet.encrypt(summary_data['full_summary'].encode()).decode()

        def truncate(text, length=2000):
            # Clip to the Salesforce field limit; default missing aspects.
            return text[:length] if text else 'Not extracted'

        dates = summary_data['dates']
        aspect_data = summary_data['aspect_summaries']
        record = {
            'Name': file_name,
            'Contract_Document__c': contract_document_id,
            'Parties__c': truncate(aspect_data.get('parties', 'Not extracted')),
            'Payment_Terms__c': truncate(aspect_data.get('payment terms', 'Not extracted')),
            'Obligations__c': truncate(aspect_data.get('obligations', 'Not extracted')),
            'Termination_Clause__c': truncate(aspect_data.get('termination clauses', 'Not extracted')),
            'Custom_Field_1__c': encrypted_summary,
            'Validation_Status__c': 'Pending',
            'Start_Date__c': dates[0][:10] if dates and len(dates) > 0 else None,
            'End_Date__c': dates[1][:10] if dates and len(dates) > 1 else dates[0][:10] if dates else None,
        }
        logger.debug(f"Record to be created: {record}")
        aspect_fields = ('Parties__c', 'Payment_Terms__c', 'Obligations__c', 'Termination_Clause__c')
        if all(record.get(field) in ['', 'Not extracted'] for field in aspect_fields):
            logger.warning(f"No valid aspects extracted for {file_name}, storing with full summary only")
        result = sf.Contract_Summary__c.create(record)
        logger.info(f"Stored summary for {file_name} with ID {result['id']}")
        return result, None
    except Exception as e:
        logger.error(f"Failed to store summary for {file_name}: {str(e)}")
        return None, f"Failed to store in Salesforce: {str(e)}. Check {log_file}"
401
+
402
# Generate CSV report
def generate_report(sf, output_file, contract_document_id):
    """Build a one-row CSV report for a Contract Document's stored summary.

    Decrypts the stored full summary and truncates fields for display.

    Fix over the previous version: Salesforce returns present-but-null fields
    as None, so ``r.get(key, 'Not extracted')[:50]`` raised TypeError for any
    empty field — the key exists, so the default is never used. Null values
    are now coalesced explicitly before slicing.

    Returns:
        tuple: (DataFrame, output_file) on success; (empty DataFrame, None)
        when no summary exists or on failure.
    """
    report_columns = ['Contract_Name', 'Parties', 'Payment_Terms', 'Obligations',
                      'Termination_Clause', 'Full_Summary', 'Validation_Status',
                      'Start_Date', 'End_Date']
    try:
        query = (
            f"SELECT Id, Name, Parties__c, Payment_Terms__c, Obligations__c, Termination_Clause__c, Custom_Field_1__c, "
            f"Validation_Status__c, Start_Date__c, End_Date__c "
            f"FROM Contract_Summary__c WHERE Contract_Document__c = '{contract_document_id}' LIMIT 1"
        )
        logger.debug(f"Executing SOQL query: {query}")
        results = sf.query(query)['records']
        logger.info(f"Retrieved {len(results)} records for Contract_Document__c ID {contract_document_id}")

        def field(r, key):
            # Coalesce Salesforce nulls (None) as well as missing keys.
            return r.get(key) or 'Not extracted'

        rows = []
        for r in results:
            try:
                decrypted_summary = fernet.decrypt(r['Custom_Field_1__c'].encode()).decode() if r.get('Custom_Field_1__c') else 'Not extracted'
            except Exception as e:
                logger.error(f"Decryption failed for record {r.get('Id', 'unknown')}: {str(e)}")
                decrypted_summary = 'Decryption failed'
            rows.append({
                'Contract_Name': field(r, 'Name'),
                'Parties': field(r, 'Parties__c')[:50],
                'Payment_Terms': field(r, 'Payment_Terms__c')[:50],
                'Obligations': field(r, 'Obligations__c')[:50],
                'Termination_Clause': field(r, 'Termination_Clause__c')[:50],
                'Full_Summary': decrypted_summary[:100],
                'Validation_Status': field(r, 'Validation_Status__c'),
                'Start_Date': field(r, 'Start_Date__c'),
                'End_Date': field(r, 'End_Date__c'),
            })
        if not rows:
            logger.warning(f"No summary found for Contract_Document__c ID {contract_document_id}")
            return pd.DataFrame(columns=report_columns), None
        df = pd.DataFrame(rows)
        logger.info(f"Generated DataFrame with {len(df)} record(s) for {contract_document_id}")
        df.to_csv(output_file, index=False, encoding='utf-8')
        logger.info(f"Saved report to {output_file}")
        return df, output_file
    except Exception as e:
        logger.error(f"Report generation failed: {str(e)}")
        return pd.DataFrame(columns=report_columns), None
443
+
444
# Gradio interface function
def gradio_process(file, progress=gr.Progress()):
    """End-to-end pipeline for one uploaded contract.

    Validates the file, extracts text, summarizes it, persists document and
    summary records to Salesforce, and produces a CSV report.

    Returns:
        tuple: (status_message, report_dataframe, csv_path_or_None).
    """
    try:
        if not file:
            logger.error("No file uploaded")
            return "Error: No file uploaded.", pd.DataFrame(), None
        # Gradio may hand us a tempfile wrapper or a plain path string.
        file_path = file.name if hasattr(file, 'name') else file
        file_name = os.path.basename(file_path)

        progress(0.1, desc="Validating file...")
        is_valid, error = validate_file(file_path)
        if not is_valid:
            logger.error(error)
            return f"Error: {error}", pd.DataFrame(), None

        progress(0.2, desc="Extracting text...")
        text, error = extract_text(file_path)
        if error:
            logger.error(f"Text extraction failed: {error}")
            return f"Error extracting text from {file_name}: {error}. Check {log_file}", pd.DataFrame(), None

        progress(0.4, desc="Initializing Salesforce and models...")
        sf = init_salesforce()
        summarizer, sentence_model = init_models()

        progress(0.6, desc="Summarizing contract...")
        summary_data, err = summarize_contract(text, summarizer, sentence_model)
        if err:
            logger.error(f"Summarization failed: {err}")
            return f"Error summarizing {file_name}: {err}. Check {log_file}", pd.DataFrame(), None

        progress(0.8, desc="Storing data in Salesforce...")
        contract_doc_id, err = create_contract_document(sf, file_name)
        if err:
            logger.error(f"Contract document creation failed: {err}")
            return f"Error creating Contract Document for {file_name}: {err}. Check {log_file}", pd.DataFrame(), None
        result, err = store_in_salesforce(sf, summary_data, file_name, contract_doc_id)
        if err:
            logger.error(f"Salesforce storage failed: {err}")
            return f"Error storing summary for {file_name}: {err}. Check {log_file}", pd.DataFrame(), None

        progress(0.9, desc="Generating report...")
        csv_path = os.path.join(tempfile.gettempdir(), f"contract_summary_{file_name}.csv")
        report_df, csv_path = generate_report(sf, csv_path, contract_doc_id)
        if report_df.empty:
            logger.warning(f"No valid report data generated for {file_name}")
            return f"Success! Summary stored for {file_name} with ID {result['id']}. No report data.", pd.DataFrame(), None

        progress(1.0, desc="Complete!")
        return (
            f"Success! Summary stored for {file_name} with ID {result['id']}. Report generated.",
            report_df,
            csv_path
        )
    except Exception as e:
        # file_name may not exist if the failure happened before it was set.
        logger.error(f"Processing error for {file_name if 'file_name' in locals() else 'unknown file'}: {str(e)}")
        return f"Error processing {file_name if 'file_name' in locals() else 'file'}: {str(e)}. Check {log_file}", pd.DataFrame(), None
494
+
495
# Gradio UI setup
with gr.Blocks(title="AI-Powered Contract Summarizer with Salesforce Integration") as iface:
    gr.Markdown("AI Contract Summarizer")
    with gr.Row():
        file_input = gr.File(label="Upload Contract File (PDF, DOCX, PNG, JPG, CSV, XLS/XLSX)")
        submit_btn = gr.Button("Submit", elem_classes=["bg-orange-500"])
    result_output = gr.Textbox(label="Result", lines=5)
    report_output = gr.DataFrame(
        label="Contract Summary Report",
        headers=['Contract_Name', 'Parties', 'Payment_Terms', 'Obligations',
                 'Termination_Clause', 'Full_Summary', 'Validation_Status',
                 'Start_Date', 'End_Date'],
        interactive=False,
    )
    csv_output = gr.File(label="Download CSV Report")
    # Wire the button to the end-to-end processing pipeline.
    submit_btn.click(
        fn=gradio_process,
        inputs=[file_input],
        outputs=[result_output, report_output, csv_output],
    )
509
+
510
if __name__ == "__main__":
    # Warn (but still start) when optional OCR tooling is absent.
    logger.info(f"Starting Gradio interface. Logs saved to {log_file}")
    if missing_deps:
        logger.warning(f"Application running with limited functionality due to missing dependencies: {', '.join(missing_deps)}")
    iface.launch()