prernajeet01 committed on
Commit
7111a0a
·
verified ·
1 Parent(s): a7e67ed

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import gradio as gr
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
+ from sklearn.ensemble import IsolationForest
10
+ from sklearn.preprocessing import StandardScaler
11
+ import openai
12
+ from datetime import datetime, timedelta
13
+ import json
14
+
15
# The OpenAI key is read from the environment (configured as a Hugging Face
# Spaces secret); it stays None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")
17
+
18
def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
    """Ask OpenAI for a short fraud-analysis report on the flagged transactions.

    Args:
        transaction_data: DataFrame of every analysed transaction; must contain
            an ``amount`` column.
        suspicious_transactions: DataFrame holding the rows of
            ``transaction_data`` that were flagged as suspicious.

    Returns:
        The model's report text, or a human-readable message when the API key
        is missing or anything in the analysis fails (errors are reported as
        text rather than raised, because the result is shown in the UI).
    """
    if not openai.api_key:
        return "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."

    def _mean_amount(frame):
        # An empty (or all-NaN) frame has no mean; report null instead of NaN.
        value = frame['amount'].mean()
        return None if pd.isna(value) else round(float(value), 2)

    try:
        # Serialise the sample through pandas: json.dumps() cannot encode the
        # Timestamp values that to_dict(orient='records') leaves in the rows.
        suspicious_sample_json = suspicious_transactions.head(5).to_json(
            orient='records', date_format='iso'
        )

        total_count = len(transaction_data)
        flagged_count = len(suspicious_transactions)
        summary_stats = {
            "total_transactions": total_count,
            "flagged_transactions": flagged_count,
            # Guard the division so an empty data set yields 0% instead of raising.
            "flagged_percentage": round(flagged_count / total_count * 100, 2) if total_count else 0.0,
            "avg_transaction_amount": _mean_amount(transaction_data),
            "suspicious_avg_amount": _mean_amount(suspicious_transactions),
        }

        # Create prompt for OpenAI
        prompt = f"""
        Analyze these potentially fraudulent transactions and identify patterns or anomalies:

        Transaction Data Summary:
        {json.dumps(summary_stats)}

        Sample of Suspicious Transactions:
        {suspicious_sample_json}

        Provide a concise fraud analysis report with:
        1. Key patterns and red flags in these transactions
        2. Possible fraud scenarios explaining the anomalies
        3. Recommended next steps for investigation
        """

        # Call OpenAI API
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a fraud detection expert helping analyze suspicious financial transactions."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=800,
        )

        return response.choices[0].message.content

    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error in AI analysis: {str(e)}"
67
+
68
def load_and_preprocess_data(file):
    """Load transaction data from a CSV or Excel upload and normalise it.

    Args:
        file: ``None``, a filesystem path (``str`` / ``os.PathLike``), or an
            upload object exposing the path through a ``name`` attribute.

    Returns:
        ``None`` when ``file`` is ``None``; otherwise a DataFrame that is
        guaranteed to have ``transaction_id``, ``amount`` (numeric) and
        ``timestamp`` (datetime) columns. When one of these is absent it is
        copied from the first column whose name looks like an alias.

    Raises:
        ValueError: unsupported extension, empty file, required columns that
            cannot be found, or values that cannot be converted.
    """
    if file is None:
        return None

    # Accept both a plain path and an upload wrapper carrying the path in .name.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    file_extension = os.path.splitext(path)[1].lower()
    if file_extension == '.csv':
        df = pd.read_csv(path)
    elif file_extension in ('.xlsx', '.xls'):
        df = pd.read_excel(path)
    else:
        raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

    if df.empty:
        raise ValueError("The uploaded file is empty.")

    # For each required column, a predicate recognising likely alias names.
    # str() keeps the check safe for non-string column labels.
    alias_rules = {
        'transaction_id': lambda col: str(col).lower().endswith('id'),
        'amount': lambda col: str(col).lower() in ('value', 'sum', 'total', 'price'),
        'timestamp': lambda col: str(col).lower() in ('date', 'time', 'datetime'),
    }

    missing_columns = []
    for required, looks_like_alias in alias_rules.items():
        if required in df.columns:
            continue
        alias = next((col for col in df.columns if looks_like_alias(col)), None)
        if alias is None:
            missing_columns.append(required)
        else:
            df[required] = df[alias]

    if missing_columns:
        raise ValueError(f"Missing required columns: {', '.join(missing_columns)}. Please ensure your data includes columns for transaction ID, amount, and timestamp.")

    # Convert timestamp to datetime if it's not already
    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
        try:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        except (ValueError, TypeError, OverflowError) as err:
            raise ValueError("Could not convert timestamp column to datetime format.") from err

    # Ensure amount is numeric
    try:
        df['amount'] = pd.to_numeric(df['amount'])
    except (ValueError, TypeError) as err:
        raise ValueError("Could not convert amount column to numeric values.") from err

    return df
130
+
131
def detect_fraud_and_anomalies(df):
    """Score each transaction with an Isolation Forest plus heuristic rules.

    Args:
        df: DataFrame with a numeric ``amount`` column. Optional columns that
            are used when present: ``timestamp`` (datetime), ``location``, and
            ``user_id`` or ``account_id``.

    Returns:
        A new DataFrame (the input is not modified) with these added columns:
        ``anomaly_score``, ``is_anomaly``, ``high_amount``, ``unusual_hour``,
        ``high_frequency``, ``rapid_succession``, ``fraud_score`` and
        ``is_suspicious``. When an account column exists, ``transaction_count``
        is added; when a timestamp exists as well, ``time_diff`` is added and
        the rows are sorted by account and timestamp.
    """
    # Work on a copy so the caller's frame is not silently given extra columns.
    df = df.copy()
    has_timestamp = 'timestamp' in df.columns
    id_col = next((col for col in ('user_id', 'account_id') if col in df.columns), None)

    # Create feature set for anomaly detection
    features = df[['amount']].copy()
    if has_timestamp:
        features['hour_of_day'] = df['timestamp'].dt.hour
        features['day_of_week'] = df['timestamp'].dt.dayofweek
    if 'location' in df.columns:
        # One-hot encode location
        location_dummies = pd.get_dummies(df['location'], prefix='location')
        features = pd.concat([features, location_dummies], axis=1)

    # IsolationForest rejects NaN input, so impute gaps (blank amounts, missing
    # timestamps) with the column median, falling back to 0 for all-NaN columns.
    features = features.astype(float)
    features = features.fillna(features.median()).fillna(0.0)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features)

    # fit_predict returns a label per row: -1 for anomalies, 1 for inliers.
    # The column name is kept as 'anomaly_score' for compatibility.
    clf = IsolationForest(contamination=0.05, random_state=42)
    df['anomaly_score'] = clf.fit_predict(scaled_features)
    df['is_anomaly'] = df['anomaly_score'] == -1

    # 1. Unusually large transactions (above the 95th percentile)
    amount_threshold = df['amount'].quantile(0.95)
    df['high_amount'] = df['amount'] > amount_threshold

    # 2. Transactions occurring at unusual hours (00:00-04:59)
    if has_timestamp:
        df['unusual_hour'] = df['timestamp'].dt.hour.isin([0, 1, 2, 3, 4])
    else:
        df['unusual_hour'] = False

    # 3. Transaction frequency per user/account. Mapping the counts back keeps
    # every row; an inner merge would drop rows whose id is missing.
    if id_col is not None:
        df['transaction_count'] = df[id_col].map(df[id_col].value_counts())
        df['high_frequency'] = df['transaction_count'] > df['transaction_count'].quantile(0.9)
    else:
        df['high_frequency'] = False

    # 4. Velocity check: a transaction less than 5 minutes after the previous
    # one from the same user/account.
    if has_timestamp and id_col is not None:
        df = df.sort_values([id_col, 'timestamp'])
        gap_seconds = df.groupby(id_col)['timestamp'].diff().dt.total_seconds()
        # Compare before filling: the first transaction of each account has no
        # predecessor (NaN gap) and must not count as "rapid".
        df['rapid_succession'] = gap_seconds < 300
        df['time_diff'] = gap_seconds.fillna(0)
    else:
        df['rapid_succession'] = False

    # Combine all fraud indicators; the model's verdict weighs the most.
    df['fraud_score'] = (
        df['is_anomaly'].astype(int) * 3 +
        df['high_amount'].astype(int) * 2 +
        df['unusual_hour'].astype(int) +
        df['high_frequency'].astype(int) +
        df['rapid_succession'].astype(int)
    )

    # Flag as suspicious if fraud score reaches the threshold
    df['is_suspicious'] = df['fraud_score'] >= 3

    return df
200
+
201
def create_visualizations(df):
    """Build the Plotly figures that illustrate the fraud analysis.

    Args:
        df: Scored transactions with ``amount``, ``fraud_score`` and
            ``is_suspicious`` columns, and optionally ``timestamp``.

    Returns:
        A dict of figures keyed ``amount_distribution`` and
        ``fraud_score_dist``, plus ``time_series`` and ``hourly_pattern`` when
        a ``timestamp`` column is present.
    """
    flag_colors = {True: 'red', False: 'blue'}
    has_time = 'timestamp' in df.columns
    charts = {}

    # Histogram of amounts, suspicious rows highlighted.
    charts['amount_distribution'] = px.histogram(
        df,
        x='amount',
        color='is_suspicious',
        color_discrete_map=flag_colors,
        title='Distribution of Transaction Amounts',
        labels={'amount': 'Transaction Amount', 'is_suspicious': 'Suspicious'},
    )

    # Scatter of amounts over time.
    if has_time:
        charts['time_series'] = px.scatter(
            df,
            x='timestamp',
            y='amount',
            color='is_suspicious',
            color_discrete_map=flag_colors,
            title='Transaction Amounts Over Time',
            labels={'amount': 'Transaction Amount', 'timestamp': 'Time', 'is_suspicious': 'Suspicious'},
        )

    # Histogram of the combined fraud score.
    charts['fraud_score_dist'] = px.histogram(
        df,
        x='fraud_score',
        title='Distribution of Fraud Scores',
        labels={'fraud_score': 'Fraud Score'},
    )

    # Transactions per hour of day, split by the suspicious flag.
    if has_time:
        hour_of_day = df['timestamp'].dt.hour
        per_hour = df.groupby([hour_of_day, 'is_suspicious']).size().reset_index(name='count')
        charts['hourly_pattern'] = px.line(
            per_hour,
            x='timestamp',
            y='count',
            color='is_suspicious',
            color_discrete_map=flag_colors,
            title='Hourly Transaction Pattern',
            labels={'timestamp': 'Hour of Day', 'count': 'Number of Transactions', 'is_suspicious': 'Suspicious'},
        )

    return charts
244
+
245
def process_transactions(file):
    """Run the full pipeline for one upload and return the values for the UI.

    Args:
        file: The uploaded file as passed on by the Gradio ``File`` input
            (``None`` when nothing was uploaded).

    Returns:
        A 6-tuple ``(stats_markdown, ai_analysis, csv_path, amount_plot,
        time_series_plot, fraud_score_plot)``. On failure, or when no file was
        uploaded, the first item is a message and the rest are ``None``.
    """
    import tempfile  # local import: only needed for the CSV download

    try:
        # Load and preprocess data
        df = load_and_preprocess_data(file)
        if df is None:
            return "No file uploaded", None, None, None, None, None

        # Detect fraud and anomalies
        df_with_anomalies = detect_fraud_and_anomalies(df)
        suspicious_transactions = df_with_anomalies[df_with_anomalies['is_suspicious']]

        # Create visualizations
        visualizations = create_visualizations(df_with_anomalies)

        # Basic statistics
        total_transactions = len(df_with_anomalies)
        suspicious_count = len(suspicious_transactions)
        suspicious_percentage = round((suspicious_count / total_transactions) * 100, 2)
        # The mean of an empty selection is NaN; show 0.00 rather than "$nan".
        suspicious_average = suspicious_transactions['amount'].mean() if suspicious_count else 0.0

        # Format statistics for display
        stats_summary = f"""
        ## Transaction Analysis Summary

        - **Total Transactions**: {total_transactions}
        - **Suspicious Transactions**: {suspicious_count} ({suspicious_percentage}%)
        - **Total Transaction Value**: ${df_with_anomalies['amount'].sum():,.2f}
        - **Suspicious Transaction Value**: ${suspicious_transactions['amount'].sum():,.2f}
        - **Average Transaction Amount**: ${df_with_anomalies['amount'].mean():,.2f}
        - **Average Suspicious Amount**: ${suspicious_average:,.2f}
        """

        # Get AI analysis of suspicious transactions
        ai_analysis = analyze_transaction_with_ai(df_with_anomalies, suspicious_transactions)

        # A Gradio File output expects a path on disk, not CSV text, so write
        # the flagged rows to a temp file. delete=False keeps it available for
        # Gradio to serve after this function returns.
        with tempfile.NamedTemporaryFile(
            mode="w",
            prefix="suspicious_transactions_",
            suffix=".csv",
            delete=False,
            newline="",
            encoding="utf-8",
        ) as csv_file:
            suspicious_transactions.to_csv(csv_file, index=False)
            csv_path = csv_file.name

        # Return results and visualizations
        return (
            stats_summary,
            ai_analysis,
            csv_path,
            visualizations.get('amount_distribution', None),
            visualizations.get('time_series', None),
            visualizations.get('fraud_score_dist', None),
        )

    except Exception as e:
        # UI boundary: report the problem in the summary pane instead of crashing.
        return f"Error: {str(e)}", None, None, None, None, None
294
+
295
def create_gradio_interface():
    """Assemble the Gradio Blocks UI and wire the analyze button.

    Returns:
        The ``gr.Blocks`` application: a file upload, an analyze button, and
        tabs for the summary, the plots and the suspicious-transaction file.
        Clicking the button calls ``process_transactions`` with the upload.
    """
    with gr.Blocks(title="AI Fraud Detection System") as demo:
        gr.Markdown("# AI Transaction Fraud & Anomaly Detection System")
        gr.Markdown("Upload your transaction data (CSV or Excel) to detect potential fraud and anomalies.")

        with gr.Row():
            upload = gr.File(
                label="Upload Transaction Data",
                file_types=[".csv", ".xlsx", ".xls"],
            )

        with gr.Row():
            analyze_button = gr.Button("Analyze Transactions", variant="primary")

        with gr.Tabs():
            with gr.TabItem("Summary"):
                summary_md = gr.Markdown(label="Statistics Summary")
                ai_report_md = gr.Markdown(label="AI Analysis")

            with gr.TabItem("Visualizations"):
                with gr.Row():
                    amount_plot = gr.Plot(label="Transaction Amount Distribution")

                with gr.Row():
                    timeline_plot = gr.Plot(label="Transactions Over Time")
                    score_plot = gr.Plot(label="Fraud Score Distribution")

            with gr.TabItem("Suspicious Transactions"):
                flagged_file = gr.File(label="Download Suspicious Transactions (CSV)")

        # Order must match the tuple returned by process_transactions.
        result_components = [
            summary_md,
            ai_report_md,
            flagged_file,
            amount_plot,
            timeline_plot,
            score_plot,
        ]
        analyze_button.click(
            process_transactions,
            inputs=[upload],
            outputs=result_components,
        )

    return demo
331
+
332
if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server.
    # share=True asks Gradio for a public share link.
    app = create_gradio_interface()
    app.launch(share=True)