Abhisesh7 commited on
Commit
23ace93
·
verified ·
1 Parent(s): 7d688eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -313
app.py CHANGED
@@ -1,248 +1,15 @@
1
  import os
2
- from dotenv import load_dotenv
3
  import logging
4
- import pandas as pd
5
- import numpy as np
6
- from sklearn.ensemble import IsolationForest
7
- from sklearn.preprocessing import StandardScaler
8
- import uuid
9
- from datetime import datetime, timedelta
10
- import re
11
  import gradio as gr
12
- import time
13
- from simple_salesforce import Salesforce, SalesforceAuthenticationFailed
14
  from pdf_extraction import extract_text_from_pdf
15
  from image_extraction import extract_text_from_image
16
 
17
- # Load environment variables from .env file
18
- load_dotenv()
19
-
20
- # Configure environment for CPU usage
21
- os.environ["CUDA_VISIBLE_DEVICES"] = "" # Disable GPU usage
22
- os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" # Disable oneDNN optimizations
23
-
24
  # Set up logging
25
  logging.basicConfig(level=logging.DEBUG) # Set to DEBUG for detailed logging
26
  logger = logging.getLogger(__name__)
27
 
28
- # Read Salesforce credentials from environment variables
29
- SF_USERNAME = os.getenv("SF_USERNAME")
30
- SF_PASSWORD = os.getenv("SF_PASSWORD")
31
- SF_SECURITY_TOKEN = os.getenv("SF_SECURITY_TOKEN")
32
-
33
- logger.info(f"Salesforce login info: username={SF_USERNAME}")
34
-
35
- # Salesforce connection with error handling
36
- try:
37
- sf = Salesforce(
38
- username=SF_USERNAME,
39
- password=SF_PASSWORD,
40
- security_token=SF_SECURITY_TOKEN
41
- )
42
- logger.info("Salesforce login successful.")
43
- except SalesforceAuthenticationFailed as e:
44
- logger.error(f"Salesforce authentication failed: {e}")
45
- sf = None
46
-
47
- def extract_basic_info(text):
48
- """Extract minimal information for fraud detection without altering the raw text."""
49
- invoice_number = "Unknown"
50
- vendor_name = "Unknown"
51
- invoice_date = datetime.now().date()
52
- total_amount = 0.0
53
-
54
- # Minimal regex patterns for fraud detection
55
- invoice_num_pattern = r"(?:invoice\s*(?:number|no\.?|#)|order\s*(?:number|no\.?))\s*[:\-\s#]*([\w-]+)|(?:inv-|ord-)([\w-]+)"
56
- vendor_pattern = r"(?:vendor\s*(?:name|company)?|supplier|company\s*name|from|sold\s*by|to)\s*[:\-\s]*([A-Za-z\s&\.\-]+)(?=\s*(?:address|invoice\s*(?:no|number)|date|phone|email|\n|$))"
57
- invoice_date_pattern = r"(?:invoice\s*date|date|issue\s*date|order\s*date)\s*[:\-\s]*(\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4}|\d{2}\s*[A-Za-z]+\s*\d{4}|[A-Za-z]+\s*\d{1,2}(?:st|nd|rd|th)?\s*,?\s*\d{4}|\d{1,2}\s*[A-Za-z]+\s*\d{4})"
58
- total_amount_pattern = r"(?:total\s*(?:amount|due)?|amount\s*due|total|grand\s*total)\s*[:\-\s]*[$₹€]?\s*([\d,]+\.?\d*)\s*(?:usd|inr|gbp|eur)?"
59
-
60
- # Invoice Number
61
- invoice_num_match = re.search(invoice_num_pattern, text, re.IGNORECASE)
62
- if invoice_num_match:
63
- invoice_number = invoice_num_match.group(1) if invoice_num_match.group(1) else invoice_num_match.group(2)
64
- logger.info(f"Matched Invoice Number: {invoice_number}")
65
-
66
- # Vendor Name
67
- vendor_match = re.search(vendor_pattern, text, re.IGNORECASE)
68
- if vendor_match:
69
- vendor_name = vendor_match.group(1).strip()
70
- logger.info(f"Matched Vendor Name: {vendor_name}")
71
-
72
- # Invoice Date
73
- invoice_date_match = re.search(invoice_date_pattern, text, re.IGNORECASE)
74
- if invoice_date_match:
75
- date_str = invoice_date_match.group(1)
76
- try:
77
- if "/" in date_str:
78
- invoice_date = datetime.strptime(date_str, "%m/%d/%Y").date()
79
- elif "," in date_str:
80
- date_str = re.sub(r'(st|nd|rd|th)', '', date_str)
81
- invoice_date = datetime.strptime(date_str, "%B %d, %Y").date()
82
- elif "-" in date_str:
83
- try:
84
- invoice_date = datetime.strptime(date_str, "%Y-%m-%d").date()
85
- except ValueError:
86
- invoice_date = datetime.strptime(date_str, "%d-%m-%Y").date()
87
- else:
88
- date_str = re.sub(r'(st|nd|rd|th)', '', date_str)
89
- invoice_date = datetime.strptime(date_str, "%d %B %Y").date()
90
- logger.info(f"Matched Invoice Date: {invoice_date}")
91
- except ValueError as e:
92
- logger.warning(f"Failed to parse Invoice Date '{date_str}': {str(e)}")
93
-
94
- # Total Amount
95
- total_amount_match = re.search(total_amount_pattern, text, re.IGNORECASE)
96
- if total_amount_match:
97
- total_amount = float(total_amount_match.group(1).replace(",", ""))
98
- logger.info(f"Matched Total Amount: {total_amount}")
99
-
100
- return invoice_number, vendor_name, invoice_date, total_amount
101
-
102
- def fetch_vendor_history(vendor_name, invoice_number, time_window_days=30):
103
- """Fetch historical invoices for the vendor from Salesforce with retry logic."""
104
- if sf is None:
105
- logger.warning("Salesforce client not initialized. Skipping vendor history fetch.")
106
- return pd.DataFrame()
107
-
108
- max_retries = 3
109
- retry_delay = 5 # seconds
110
-
111
- for attempt in range(1, max_retries + 1):
112
- try:
113
- end_date = datetime.now().date()
114
- start_date = end_date - timedelta(days=time_window_days)
115
-
116
- query = f"""
117
- SELECT Invoice_Number__c, Invoice_Amount__c, Invoice_Date__c, Vendor_Name__c
118
- FROM Invoice_Record__c
119
- WHERE Invoice_Date__c >= {start_date} AND Invoice_Date__c <= {end_date}
120
- AND Vendor_Name__c = '{vendor_name}'
121
- LIMIT 100
122
- """
123
- logger.info(f"Fetching vendor history for {vendor_name} (Attempt {attempt}/{max_retries})...")
124
- result = sf.query(query)
125
- records = result['records']
126
-
127
- history_df = pd.DataFrame(records)
128
- if not history_df.empty:
129
- history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c']).dt.date
130
- logger.info(f"Successfully fetched vendor history for {vendor_name}.")
131
- return history_df
132
- except Exception as e:
133
- logger.warning(f"Failed to fetch vendor history (Attempt {attempt}/{max_retries}): {str(e)}")
134
- if attempt < max_retries:
135
- logger.info(f"Retrying in {retry_delay} seconds...")
136
- time.sleep(retry_delay)
137
- else:
138
- logger.error(f"Failed to fetch vendor history after all retries: {str(e)}")
139
- return pd.DataFrame()
140
-
141
- def check_data_consistency(invoice_number, vendor_name, invoice_date, history_df):
142
- """Check for data consistency issues like duplicates."""
143
- consistency_issues = []
144
-
145
- if not history_df.empty:
146
- duplicate_invoices = history_df[history_df['Invoice_Number__c'] == invoice_number]
147
- if not duplicate_invoices.empty:
148
- consistency_issues.append(f"Duplicate invoice number '{invoice_number}' found for vendor '{vendor_name}'.")
149
-
150
- return consistency_issues
151
-
152
- def detect_anomalies(df, history_df):
153
- """Detect anomalies in amount, frequency, and vendor patterns."""
154
- df["is_amount_anomaly"] = 0
155
- df["is_frequency_anomaly"] = 0
156
- df["is_vendor_pattern_anomaly"] = 0
157
-
158
- if not df.empty:
159
- scaler = StandardScaler()
160
- X_scaled = scaler.fit_transform(df[["amount"]])
161
- model = IsolationForest(contamination=0.05, random_state=42)
162
- df["is_amount_anomaly"] = model.fit_predict(X_scaled)
163
-
164
- if not history_df.empty:
165
- history_df['Invoice_Date__c'] = pd.to_datetime(history_df['Invoice_Date__c'])
166
- date_range = (history_df['Invoice_Date__c'].max() - history_df['Invoice_Date__c'].min()).days + 1
167
- frequency = len(history_df) / max(date_range, 1)
168
-
169
- date_diffs = [(d - history_df['Invoice_Date__c'].min()).days for d in history_df['Invoice_Date__c']]
170
- date_clustering = np.std(date_diffs) if len(date_diffs) > 1 else 0
171
-
172
- frequency_df = pd.DataFrame({
173
- "frequency": [frequency],
174
- "date_clustering": [date_clustering]
175
- })
176
- scaler = StandardScaler()
177
- X_scaled = scaler.fit_transform(frequency_df[["frequency", "date_clustering"]])
178
- model = IsolationForest(contamination=0.05, random_state=42)
179
- df["is_frequency_anomaly"] = model.fit_predict(X_scaled)[0]
180
- else:
181
- df["is_frequency_anomaly"] = 1
182
-
183
- if not history_df.empty and len(history_df) > 1:
184
- historical_amounts = history_df["Invoice_Amount__c"].astype(float)
185
- mean_amount = historical_amounts.mean()
186
- std_amount = historical_amounts.std() if len(historical_amounts) > 1 else 1
187
- amount_variance = historical_amounts.var() if len(historical_amounts) > 1 else 0
188
-
189
- current_amount = df["amount"].iloc[0]
190
- deviation = abs(current_amount - mean_amount) / (std_amount if std_amount > 0 else 1)
191
- invoice_count = len(history_df)
192
-
193
- vendor_pattern_df = pd.DataFrame({
194
- "amount_deviation": [deviation],
195
- "invoice_count": [invoice_count],
196
- "amount_variance": [amount_variance]
197
- })
198
- scaler = StandardScaler()
199
- X_scaled = scaler.fit_transform(vendor_pattern_df[["amount_deviation", "invoice_count", "amount_variance"]])
200
- model = IsolationForest(contamination=0.05, random_state=42)
201
- df["is_vendor_pattern_anomaly"] = model.fit_predict(X_scaled)[0]
202
- else:
203
- df["is_vendor_pattern_anomaly"] = 1
204
-
205
- return df
206
-
207
- def calculate_fraud_score(amount, is_amount_anomaly, is_frequency_anomaly, is_vendor_pattern_anomaly, text_length, consistency_issues, invoice_date):
208
- """Calculate fraud score based on amount, anomalies, text length, consistency issues, and invoice date."""
209
- score = 0.0
210
- reasoning = []
211
- today = datetime.now().date()
212
-
213
- if amount > 5000:
214
- score += 40
215
- reasoning.append("High invoice amount detected.")
216
- elif amount < 10:
217
- score += 20
218
- reasoning.append("Unusually low invoice amount.")
219
-
220
- if invoice_date > today:
221
- score += 10
222
- reasoning.append("Invoice date is in the future.")
223
-
224
- if is_amount_anomaly == -1:
225
- score += 30
226
- reasoning.append("Amount flagged as an anomaly.")
227
- if is_frequency_anomaly == -1:
228
- score += 25
229
- reasoning.append("Unusual invoice submission frequency or clustering detected.")
230
- if is_vendor_pattern_anomaly == -1:
231
- score += 25
232
- reasoning.append("Unusual vendor pattern detected (amount deviation, frequency, or variance).")
233
-
234
- if text_length > 500:
235
- score += 10
236
- reasoning.append("Excessive text length in invoice.")
237
-
238
- if consistency_issues:
239
- score += 15 * len(consistency_issues)
240
- reasoning.extend(consistency_issues)
241
-
242
- return min(score, 100), reasoning
243
-
244
  def process_invoice(file):
245
- """Process a single invoice (PDF or image) and return the raw extracted text with fraud detection."""
246
  # Determine file type and extract text accordingly
247
  file_extension = os.path.splitext(file.name)[1].lower()
248
  if file_extension == '.pdf':
@@ -250,98 +17,26 @@ def process_invoice(file):
250
  elif file_extension in ['.png', '.jpg', '.jpeg']:
251
  text = extract_text_from_image(file.name)
252
  else:
253
- return f"**Error**: Unsupported file type '{file_extension}'. Please upload a PDF, PNG, or JPG file."
254
 
255
  if "Error" in text:
256
- return f"**Error**: {text}"
257
-
258
- # Extract basic info for fraud detection
259
- invoice_number, vendor_name, invoice_date, total_amount = extract_basic_info(text)
260
- text_length = len(text)
261
-
262
- history_df = fetch_vendor_history(vendor_name, invoice_number)
263
- consistency_issues = check_data_consistency(invoice_number, vendor_name, invoice_date, history_df)
264
-
265
- data = {
266
- "invoice_id": str(uuid.uuid4()),
267
- "invoice_number": invoice_number,
268
- "vendor_name": vendor_name,
269
- "amount": total_amount,
270
- "invoice_date": invoice_date,
271
- "text_length": text_length
272
- }
273
- df = pd.DataFrame([data])
274
-
275
- df = detect_anomalies(df, history_df)
276
-
277
- fraud_score, fraud_reasoning = calculate_fraud_score(
278
- df["amount"].iloc[0],
279
- df["is_amount_anomaly"].iloc[0],
280
- df["is_frequency_anomaly"].iloc[0],
281
- df["is_vendor_pattern_anomaly"].iloc[0],
282
- text_length,
283
- consistency_issues,
284
- invoice_date
285
- )
286
-
287
- # Prepare the output with raw text only
288
- output = [
289
- "## Raw Extracted Text",
290
- "```",
291
- text,
292
- "```",
293
- "",
294
- "## Fraud Detection Summary",
295
- f"- **Invoice Number**: {invoice_number}",
296
- f"- **Vendor Name**: {vendor_name}",
297
- f"- **Invoice Date**: {invoice_date}",
298
- f"- **Invoice Amount**: ${total_amount:,.2f}",
299
- f"- **Fraud Score**: {fraud_score}",
300
- f"- **Status**: {'Flagged' if fraud_score > 50 else 'Cleared'}",
301
- f"- **Flagged**: {fraud_score > 50}",
302
- "",
303
- "## Fraud Reasoning"
304
- ]
305
-
306
- if fraud_reasoning:
307
- output.extend([f"- {reason}" for reason in fraud_reasoning])
308
- else:
309
- output.append("- No specific fraud indicators detected")
310
-
311
- if sf is not None:
312
- try:
313
- record_data = {
314
- "Invoice_Number__c": invoice_number,
315
- "Vendor_Name__c": vendor_name,
316
- "Invoice_Amount__c": total_amount,
317
- "Invoice_Date__c": str(invoice_date),
318
- "Fraud_Score__c": fraud_score,
319
- "Fraud_Reason__c": "; ".join(fraud_reasoning),
320
- "Flagged__c": fraud_score > 50,
321
- "Status__c": "Flagged" if fraud_score > 50 else "Cleared",
322
- "Items_Selected__c": "Not extracted" # Since we're not parsing items
323
- }
324
- logger.debug(f"Record data being sent to Salesforce: {record_data}")
325
- sf.Invoice_Record__c.create(record_data)
326
- logger.info("Successfully created Salesforce record.")
327
- except Exception as e:
328
- logger.error(f"Failed to create Salesforce record: {str(e)}")
329
- pass
330
 
331
- return "\n".join(output)
 
332
 
333
  def gradio_interface(file):
334
- """Gradio interface to process uploaded file (PDF or image) and display raw text with fraud detection."""
335
  if file is None:
336
  return "Please upload a PDF or image file."
337
  result = process_invoice(file)
338
  return result
339
 
340
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
341
- gr.Markdown("# Invoice Fraud Detection")
342
  with gr.Row():
343
  file_input = gr.File(label="Upload Invoice (PDF, PNG, JPG)")
344
- result_output = gr.Markdown(label="Fraud Detection Results")
345
  file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
346
 
347
  if __name__ == "__main__":
 
1
  import os
 
2
  import logging
 
 
 
 
 
 
 
3
  import gradio as gr
 
 
4
  from pdf_extraction import extract_text_from_pdf
5
  from image_extraction import extract_text_from_image
6
 
 
 
 
 
 
 
 
7
  # Set up logging
8
  logging.basicConfig(level=logging.DEBUG) # Set to DEBUG for detailed logging
9
  logger = logging.getLogger(__name__)
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  def process_invoice(file):
12
+ """Extract text from a single invoice (PDF or image) and return it as is."""
13
  # Determine file type and extract text accordingly
14
  file_extension = os.path.splitext(file.name)[1].lower()
15
  if file_extension == '.pdf':
 
17
  elif file_extension in ['.png', '.jpg', '.jpeg']:
18
  text = extract_text_from_image(file.name)
19
  else:
20
+ return f"Error: Unsupported file type '{file_extension}'. Please upload a PDF, PNG, or JPG file."
21
 
22
  if "Error" in text:
23
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # Return the raw extracted text without any additional formatting
26
+ return text
27
 
28
  def gradio_interface(file):
29
+ """Gradio interface to process uploaded file (PDF or image) and display raw text."""
30
  if file is None:
31
  return "Please upload a PDF or image file."
32
  result = process_invoice(file)
33
  return result
34
 
35
  with gr.Blocks(css=".prose a[href*='share']:has(svg) {display:none !important;}") as iface:
36
+ gr.Markdown("# Invoice Text Extraction")
37
  with gr.Row():
38
  file_input = gr.File(label="Upload Invoice (PDF, PNG, JPG)")
39
+ result_output = gr.Textbox(label="Extracted Text")
40
  file_input.change(fn=gradio_interface, inputs=file_input, outputs=result_output)
41
 
42
  if __name__ == "__main__":