prernajeet01 committed
Commit 40c771e · verified · 1 Parent(s): 5c5cc6c

Update app.py

Files changed (1)
  1. app.py +349 -143
app.py CHANGED
@@ -16,7 +16,116 @@ import tempfile
 # Set OpenAI API key from Hugging Face Spaces secrets
 openai.api_key = os.environ.get("OPENAI_API_KEY")
 
-def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
+def analyze_dataset_structure(df):
+    """Use OpenAI to analyze the dataset structure and identify relevant columns"""
+    if not openai.api_key:
+        return None, "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
+
+    try:
+        # Get basic dataset info
+        sample_data = df.head(3).to_dict(orient='records')
+        column_info = []
+
+        for col in df.columns:
+            dtype = str(df[col].dtype)
+            unique_values = len(df[col].unique())
+            null_percentage = round((df[col].isna().sum() / len(df)) * 100, 2)
+            sample_values = df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist()
+
+            column_info.append({
+                "column_name": col,
+                "data_type": dtype,
+                "unique_values_count": unique_values,
+                "null_percentage": null_percentage,
+                "sample_values": str(sample_values)[:100]  # Limit sample length
+            })
+
+        # Create prompt for OpenAI (literal braces in the JSON template are doubled
+        # so the f-string does not parse them as format fields)
+        prompt = f"""
+        Analyze this transaction dataset structure to identify the purpose of each column.
+
+        Dataset Information:
+        - Number of rows: {len(df)}
+        - Number of columns: {len(df.columns)}
+
+        Column Information:
+        {json.dumps(column_info, indent=2)}
+
+        Sample Data:
+        {json.dumps(sample_data, indent=2)}
+
+        For each column in the dataset, identify its likely purpose in a transaction dataset.
+        Specifically identify:
+
+        1. Which column is likely the transaction ID or reference number
+        2. Which column represents the transaction amount or value
+        3. Which column represents the timestamp or date of the transaction
+        4. Which column represents the user ID, account ID, or customer identifier
+        5. Which column might represent location information
+        6. Which columns might be useful for fraud detection (e.g., IP address, device info, transaction status)
+
+        Return your analysis as a JSON object with this structure:
+        {{
+            "id_column": "column_name",
+            "amount_column": "column_name",
+            "timestamp_column": "column_name",
+            "user_column": "column_name",
+            "location_column": "column_name",
+            "fraud_indicator_columns": ["column1", "column2"],
+            "column_descriptions": {{
+                "column_name": "description of purpose"
+            }}
+        }}
+
+        Include only columns that you're reasonably confident about, and use null for any category where you can't identify a matching column.
+        """
+
+        # Create an OpenAI client with the API key
+        client = openai.OpenAI(api_key=openai.api_key)
+
+        # Call OpenAI API
+        response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=1000,
+            response_format={"type": "json_object"}
+        )
+
+        # Parse the JSON response
+        structure_analysis = json.loads(response.choices[0].message.content)
+
+        # Also get a natural language explanation
+        explanation_prompt = """
+        Based on your analysis of the dataset structure, provide a brief natural language explanation of:
+        1. What kind of transactions this dataset appears to contain
+        2. What the key columns are and what they represent
+        3. What approach would be best for detecting anomalies or fraud in this specific dataset
+
+        Keep your explanation concise and focused on the unique characteristics of this dataset.
+        """
+
+        explanation_response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": response.choices[0].message.content},
+                {"role": "user", "content": explanation_prompt}
+            ],
+            max_tokens=500
+        )
+
+        explanation = explanation_response.choices[0].message.content
+
+        return structure_analysis, explanation
+
+    except Exception as e:
+        return None, f"Error analyzing dataset structure: {str(e)}"
+
+def analyze_transaction_with_ai(transaction_data, suspicious_transactions, column_mapping):
     """Use OpenAI to analyze suspicious transactions and provide insights"""
     if not openai.api_key:
         return "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
@@ -25,9 +134,10 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
     # Prepare information for OpenAI, converting to a JSON-serializable format
     suspicious_sample = suspicious_transactions.head(5).copy()
 
-    # Convert timestamp to string format to make it JSON serializable
-    if 'timestamp' in suspicious_sample.columns:
-        suspicious_sample['timestamp'] = suspicious_sample['timestamp'].astype(str)
+    # Convert any datetime columns to string format to make it JSON serializable
+    for col in suspicious_sample.columns:
+        if pd.api.types.is_datetime64_any_dtype(suspicious_sample[col]):
+            suspicious_sample[col] = suspicious_sample[col].astype(str)
 
     # Convert to dictionary
     suspicious_dict = suspicious_sample.to_dict(orient='records')
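The hunk above generalizes the old hard-coded 'timestamp' handling to a dtype check. The conversion matters because pandas Timestamp values are not JSON serializable; a quick illustration with a hypothetical txn_time column:

import json
import pandas as pd

df = pd.DataFrame({"txn_time": pd.to_datetime(["2024-01-01 10:00", "2024-01-02 11:30"])})

try:
    json.dumps(df.to_dict(orient="records"))
except TypeError as e:
    print(f"Before conversion: {e}")  # Timestamp is not JSON serializable

# Casting datetime columns to strings makes the records serializable
for col in df.columns:
    if pd.api.types.is_datetime64_any_dtype(df[col]):
        df[col] = df[col].astype(str)
print(json.dumps(df.to_dict(orient="records")))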
@@ -37,10 +147,16 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
         "total_transactions": int(len(transaction_data)),
         "flagged_transactions": int(len(suspicious_transactions)),
         "flagged_percentage": float(round(len(suspicious_transactions) / len(transaction_data) * 100, 2)),
-        "avg_transaction_amount": float(round(transaction_data['amount'].mean(), 2)),
-        "suspicious_avg_amount": float(round(suspicious_transactions['amount'].mean(), 2))
     }
 
+    # Add amount-related statistics if available
+    amount_col = column_mapping.get("amount_column")
+    if amount_col and amount_col in transaction_data.columns:
+        summary_stats.update({
+            "avg_transaction_amount": float(round(transaction_data[amount_col].mean(), 2)),
+            "suspicious_avg_amount": float(round(suspicious_transactions[amount_col].mean(), 2))
+        })
+
     # Create prompt for OpenAI
     prompt = f"""
     Analyze these potentially fraudulent transactions and identify patterns or anomalies:
@@ -48,6 +164,9 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
     Transaction Data Summary:
     {json.dumps(summary_stats)}
 
+    Column Mapping:
+    {json.dumps(column_mapping)}
+
     Sample of Suspicious Transactions:
     {json.dumps(suspicious_dict)}
 
@@ -79,7 +198,7 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions):
 def load_and_preprocess_data(file):
     """Load and preprocess transaction data from CSV or Excel file"""
     if file is None:
-        return None
+        return None, None, None
 
     # Get file extension
     file_extension = os.path.splitext(file.name)[1].lower()
@@ -96,155 +215,197 @@ def load_and_preprocess_data(file):
     if df.empty:
         raise ValueError("The uploaded file is empty.")
 
-    # Check for essential columns
-    required_columns = ['transaction_id', 'amount', 'timestamp']
-    missing_columns = [col for col in required_columns if col not in df.columns]
-
-    if missing_columns:
-        # Try to identify columns that might contain the missing information
-        if 'transaction_id' in missing_columns and any(col.lower().endswith('id') for col in df.columns):
-            potential_id_columns = [col for col in df.columns if col.lower().endswith('id')]
-            if potential_id_columns:
-                df['transaction_id'] = df[potential_id_columns[0]]
-                missing_columns.remove('transaction_id')
-
-        if 'amount' in missing_columns and any(col.lower() in ['value', 'sum', 'total', 'price'] for col in df.columns):
-            potential_amount_columns = [col for col in df.columns if col.lower() in ['value', 'sum', 'total', 'price']]
-            if potential_amount_columns:
-                df['amount'] = df[potential_amount_columns[0]]
-                missing_columns.remove('amount')
-
-        if 'timestamp' in missing_columns and any(col.lower() in ['date', 'time', 'datetime'] for col in df.columns):
-            potential_time_columns = [col for col in df.columns if col.lower() in ['date', 'time', 'datetime']]
-            if potential_time_columns:
-                df['timestamp'] = df[potential_time_columns[0]]
-                missing_columns.remove('timestamp')
-
-        # If still missing required columns, raise error
-        if missing_columns:
-            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}. Please ensure your data includes columns for transaction ID, amount, and timestamp.")
-
-    # Convert timestamp to datetime if it's not already
-    if not pd.api.types.is_datetime64_any_dtype(df['timestamp']):
+    # Analyze dataset structure with LLM
+    column_mapping, dataset_explanation = analyze_dataset_structure(df)
+
+    # If LLM analysis failed, return the raw data and the error message
+    if column_mapping is None:
+        return df, dataset_explanation, None
+
+    # Process the data based on identified columns
+    processed_df = df.copy()
+
+    # Convert timestamp to datetime if identified
+    timestamp_col = column_mapping.get("timestamp_column")
+    if timestamp_col and timestamp_col in df.columns:
         try:
-            df['timestamp'] = pd.to_datetime(df['timestamp'])
+            processed_df[timestamp_col] = pd.to_datetime(df[timestamp_col])
         except:
-            raise ValueError("Could not convert timestamp column to datetime format.")
+            print(f"Warning: Could not convert {timestamp_col} to datetime format.")
 
-    # Ensure amount is numeric
-    try:
-        df['amount'] = pd.to_numeric(df['amount'])
-    except:
-        raise ValueError("Could not convert amount column to numeric values.")
+    # Ensure amount column is numeric if identified
+    amount_col = column_mapping.get("amount_column")
+    if amount_col and amount_col in df.columns:
+        try:
+            processed_df[amount_col] = pd.to_numeric(df[amount_col])
+        except:
+            print(f"Warning: Could not convert {amount_col} to numeric values.")
 
-    return df
+    return processed_df, dataset_explanation, column_mapping
 
-def detect_fraud_and_anomalies(df):
-    """Detect fraud and anomalies in transaction data"""
+def detect_fraud_and_anomalies(df, column_mapping):
+    """Detect fraud and anomalies in transaction data based on LLM-identified columns"""
     # Create feature set for anomaly detection
-    features = df[['amount']].copy()
+    features = pd.DataFrame()
+
+    # Add amount feature if available
+    amount_col = column_mapping.get("amount_column")
+    if amount_col and amount_col in df.columns:
+        features['amount'] = df[amount_col]
 
     # Add time-based features if available
-    if 'timestamp' in df.columns:
-        # Extract hour and day of week without using .dt.to_pydatetime()
-        features['hour_of_day'] = pd.to_numeric(df['timestamp'].dt.hour)
-        features['day_of_week'] = pd.to_numeric(df['timestamp'].dt.dayofweek)
+    timestamp_col = column_mapping.get("timestamp_column")
+    if timestamp_col and timestamp_col in df.columns and pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
+        # Extract hour and day of week
+        features['hour_of_day'] = pd.to_numeric(df[timestamp_col].dt.hour)
+        features['day_of_week'] = pd.to_numeric(df[timestamp_col].dt.dayofweek)
 
-    # Add other relevant features if available
-    if 'location' in df.columns:
+    # Add location feature if available
+    location_col = column_mapping.get("location_column")
+    if location_col and location_col in df.columns:
         # One-hot encode location
-        location_dummies = pd.get_dummies(df['location'], prefix='location')
+        location_dummies = pd.get_dummies(df[location_col], prefix='location')
         features = pd.concat([features, location_dummies], axis=1)
 
+    # Add fraud indicator columns if identified
+    fraud_indicators = column_mapping.get("fraud_indicator_columns", [])
+    for col in fraud_indicators:
+        if col in df.columns:
+            if pd.api.types.is_numeric_dtype(df[col]):
+                features[col] = df[col]
+            else:
+                # One-hot encode categorical indicators
+                indicator_dummies = pd.get_dummies(df[col], prefix=col)
+                features = pd.concat([features, indicator_dummies], axis=1)
+
+    # If still no features available, use all numeric columns
+    if features.empty or features.shape[1] < 2:
+        numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
+        if numeric_cols:
+            for col in numeric_cols:
+                if col not in features.columns:
+                    features[col] = df[col]
+
+    # If still not enough features, add dummy features
+    if features.empty or features.shape[1] < 2:
+        features['dummy1'] = np.random.random(len(df))
+        features['dummy2'] = np.random.random(len(df))
+
     # Standardize features
     scaler = StandardScaler()
     scaled_features = scaler.fit_transform(features)
 
     # Apply Isolation Forest for anomaly detection
     clf = IsolationForest(contamination=0.05, random_state=42)
-    df['anomaly_score'] = clf.fit_predict(scaled_features)
+    anomaly_scores = clf.fit_predict(scaled_features)
+
+    # Create a result DataFrame with original data and anomaly scores
+    result_df = df.copy()
 
-    # Flag anomalies (anomaly_score of -1 indicates an anomaly)
-    df['is_anomaly'] = df['anomaly_score'] == -1
+    # Add anomaly flags (anomaly_score of -1 indicates an anomaly)
+    result_df['anomaly_score'] = anomaly_scores
+    result_df['is_anomaly'] = result_df['anomaly_score'] == -1
 
-    # Additional heuristic rules for fraud detection
-    # 1. Unusually large transactions
-    amount_threshold = df['amount'].quantile(0.95)
-    df['high_amount'] = df['amount'] > amount_threshold
+    # Initialize fraud indicators
+    result_df['high_amount'] = False
+    result_df['unusual_hour'] = False
+    result_df['high_frequency'] = False
+    result_df['rapid_succession'] = False
+
+    # 1. Unusually large transactions (if amount column is available)
+    if amount_col and amount_col in df.columns:
+        amount_threshold = df[amount_col].quantile(0.95)
+        result_df['high_amount'] = df[amount_col] > amount_threshold
 
     # 2. Transactions occurring at unusual hours (if timestamp available)
-    if 'timestamp' in df.columns:
-        # Fix for datetime warning
-        hours = np.array(df['timestamp'].dt.hour)
-        df['unusual_hour'] = np.isin(hours, [0, 1, 2, 3, 4])
-    else:
-        df['unusual_hour'] = False
+    if timestamp_col and timestamp_col in df.columns and pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
+        hours = np.array(df[timestamp_col].dt.hour)
+        result_df['unusual_hour'] = np.isin(hours, [0, 1, 2, 3, 4])
 
     # 3. Calculate transaction frequency by user or account (if available)
-    if 'user_id' in df.columns or 'account_id' in df.columns:
-        id_col = 'user_id' if 'user_id' in df.columns else 'account_id'
-        transaction_counts = df.groupby(id_col).size().reset_index(name='transaction_count')
-        df = df.merge(transaction_counts, on=id_col)
-        df['high_frequency'] = df['transaction_count'] > df['transaction_count'].quantile(0.9)
-    else:
-        df['high_frequency'] = False
+    user_col = column_mapping.get("user_column")
+    if user_col and user_col in df.columns:
+        transaction_counts = df.groupby(user_col).size().reset_index(name='transaction_count')
+        result_df = result_df.merge(transaction_counts, on=user_col, how='left')
+        result_df['high_frequency'] = result_df['transaction_count'] > result_df['transaction_count'].quantile(0.9)
 
     # 4. Velocity check: multiple transactions in short time period
-    if 'timestamp' in df.columns and ('user_id' in df.columns or 'account_id' in df.columns):
-        id_col = 'user_id' if 'user_id' in df.columns else 'account_id'
-        df = df.sort_values([id_col, 'timestamp'])
-
-        # Fix for datetime warning by using numpy arrays
-        time_diffs = df.groupby(id_col)['timestamp'].diff()
-        # Convert to seconds and handle NaN values
-        seconds = np.array([td.total_seconds() if pd.notnull(td) else 0 for td in time_diffs])
-        df['time_diff'] = seconds
-        df['rapid_succession'] = df['time_diff'] < 300  # Less than 5 minutes
-    else:
-        df['rapid_succession'] = False
+    if timestamp_col and user_col and timestamp_col in df.columns and user_col in df.columns:
+        if pd.api.types.is_datetime64_any_dtype(df[timestamp_col]):
+            velocity_df = df[[timestamp_col, user_col]].copy().sort_values([user_col, timestamp_col])
+            velocity_df['time_diff'] = velocity_df.groupby(user_col)[timestamp_col].diff()
+
+            # Handle potential NaT values
+            velocity_df['time_diff_seconds'] = velocity_df['time_diff'].dt.total_seconds().fillna(0)
+            velocity_df['rapid_succession'] = velocity_df['time_diff_seconds'] < 300  # Less than 5 minutes
+
+            # Map back to the original DataFrame via the shared index; a direct
+            # assignment avoids merge suffix collisions with the initialized column
+            result_df['rapid_succession'] = (
+                velocity_df['rapid_succession'].reindex(result_df.index).fillna(False)
+            )
+
+    # Combine all fraud indicators with adaptive weighting
+    weights = {
+        'is_anomaly': 3,  # Base weight for anomaly detection
+        'high_amount': 2,
+        'unusual_hour': 1,
+        'high_frequency': 1,
+        'rapid_succession': 1
+    }
 
-    # Combine all fraud indicators
-    df['fraud_score'] = (
-        df['is_anomaly'].astype(int) * 3 +  # Weighted more heavily
-        df['high_amount'].astype(int) * 2 +
-        df['unusual_hour'].astype(int) +
-        df['high_frequency'].astype(int) +
-        df['rapid_succession'].astype(int)
-    )
+    # Calculate fraud score based on available indicators
+    result_df['fraud_score'] = 0
+    for indicator, weight in weights.items():
+        if indicator in result_df.columns:
+            result_df['fraud_score'] += result_df[indicator].astype(int) * weight
 
-    # Flag as suspicious if fraud score is above threshold
-    df['is_suspicious'] = df['fraud_score'] >= 3
+    # Flag as suspicious if fraud score is above threshold (adapt based on available indicators)
+    available_weights = sum([weight for indicator, weight in weights.items() if indicator in result_df.columns])
+    threshold = max(3, available_weights * 0.3)  # At least 3 or 30% of max possible score
+    result_df['is_suspicious'] = result_df['fraud_score'] >= threshold
 
-    return df
+    return result_df
 
-def create_visualizations(df):
-    """Create visualizations for transaction data and anomalies"""
+def create_visualizations(df, column_mapping):
+    """Create visualizations for transaction data and anomalies based on LLM-identified columns"""
     visualizations = {}
 
     try:
-        # Convert timestamp to string for plotly to avoid datetime warning
+        # Prepare a copy for plotting
         plot_df = df.copy()
-        if 'timestamp' in plot_df.columns:
-            plot_df['timestamp_str'] = plot_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
-
-        # 1. Distribution of transaction amounts with anomalies highlighted
-        fig1 = px.histogram(
-            plot_df, x='amount', color='is_suspicious',
-            color_discrete_map={True: 'red', False: 'blue'},
-            title='Distribution of Transaction Amounts',
-            labels={'amount': 'Transaction Amount', 'is_suspicious': 'Suspicious'}
-        )
-        # Ensure the figure is fully rendered
-        fig1.update_layout(height=500, width=700)
-        visualizations['amount_distribution'] = fig1
 
-        # 2. Time series of transaction amounts
-        if 'timestamp' in plot_df.columns:
+        # Get important columns
+        timestamp_col = column_mapping.get("timestamp_column")
+        amount_col = column_mapping.get("amount_column")
+        user_col = column_mapping.get("user_column")
+
+        # Convert timestamp to string for plotly if it exists
+        if timestamp_col and timestamp_col in plot_df.columns:
+            if pd.api.types.is_datetime64_any_dtype(plot_df[timestamp_col]):
+                plot_df['timestamp_str'] = plot_df[timestamp_col].dt.strftime('%Y-%m-%d %H:%M:%S')
+
+        # 1. Distribution of transaction amounts with anomalies highlighted (if amount column exists)
+        if amount_col and amount_col in plot_df.columns:
+            fig1 = px.histogram(
+                plot_df, x=amount_col, color='is_suspicious',
+                color_discrete_map={True: 'red', False: 'blue'},
+                title='Distribution of Transaction Amounts',
+                labels={amount_col: 'Transaction Amount', 'is_suspicious': 'Suspicious'}
+            )
+            fig1.update_layout(height=500, width=700)
+            visualizations['amount_distribution'] = fig1
+
+        # 2. Time series of transaction amounts (if both timestamp and amount columns exist)
+        if timestamp_col and amount_col and 'timestamp_str' in plot_df.columns:
             fig2 = px.scatter(
-                plot_df, x='timestamp_str', y='amount', color='is_suspicious',
+                plot_df, x='timestamp_str', y=amount_col, color='is_suspicious',
                 color_discrete_map={True: 'red', False: 'blue'},
                 title='Transaction Amounts Over Time',
-                labels={'amount': 'Transaction Amount', 'timestamp_str': 'Time', 'is_suspicious': 'Suspicious'}
+                labels={amount_col: 'Transaction Amount', 'timestamp_str': 'Time', 'is_suspicious': 'Suspicious'}
             )
             fig2.update_layout(height=500, width=700)
             visualizations['time_series'] = fig2
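The anomaly-detection core is unchanged: IsolationForest.fit_predict labels inliers 1 and outliers -1, with contamination=0.05 targeting roughly 5% flagged rows. A self-contained sketch on toy data:

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(42)
# 200 ordinary amounts plus a few extreme outliers
amounts = np.concatenate([rng.normal(50, 10, 200), [500.0, 750.0, 1000.0]]).reshape(-1, 1)

scaled = StandardScaler().fit_transform(amounts)
labels = IsolationForest(contamination=0.05, random_state=42).fit_predict(scaled)
print((labels == -1).sum(), "of", len(labels), "rows flagged as anomalies")

Note also that because all five indicator columns are initialized up front, available_weights always sums to 3 + 2 + 1 + 1 + 1 = 8, so the threshold resolves to max(3, 8 * 0.3) = max(3, 2.4) = 3 in practice.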
@@ -258,21 +419,38 @@ def create_visualizations(df):
         fig3.update_layout(height=500, width=700)
         visualizations['fraud_score_dist'] = fig3
 
-        # 4. Hourly transaction pattern (if timestamp available)
-        if 'timestamp' in plot_df.columns:
-            # Fixed approach to get hourly data
-            hourly_counts = plot_df.groupby([plot_df['timestamp'].dt.hour, 'is_suspicious']).size()
-            hourly_df = hourly_counts.reset_index()
-            hourly_df.columns = ['hour', 'is_suspicious', 'count']
+        # 4. User transaction frequency (if user column exists)
+        if user_col and user_col in plot_df.columns:
+            user_counts = plot_df.groupby([user_col, 'is_suspicious']).size().reset_index(name='count')
+            # Limit to top 20 users by transaction count
+            top_users = plot_df.groupby(user_col).size().sort_values(ascending=False).head(20).index
+            user_counts_filtered = user_counts[user_counts[user_col].isin(top_users)]
 
-            fig4 = px.line(
-                hourly_df, x='hour', y='count', color='is_suspicious',
+            fig4 = px.bar(
+                user_counts_filtered, x=user_col, y='count', color='is_suspicious',
                 color_discrete_map={True: 'red', False: 'blue'},
-                title='Hourly Transaction Pattern',
-                labels={'hour': 'Hour of Day', 'count': 'Number of Transactions', 'is_suspicious': 'Suspicious'}
+                title='Transaction Frequency by User (Top 20)',
+                labels={user_col: 'User', 'count': 'Number of Transactions', 'is_suspicious': 'Suspicious'}
             )
             fig4.update_layout(height=500, width=700)
-            visualizations['hourly_pattern'] = fig4
+            visualizations['user_frequency'] = fig4
+
+        # 5. Hourly transaction pattern (if timestamp available)
+        if timestamp_col and timestamp_col in plot_df.columns:
+            if pd.api.types.is_datetime64_any_dtype(plot_df[timestamp_col]):
+                # Get hourly data
+                hourly_counts = plot_df.groupby([plot_df[timestamp_col].dt.hour, 'is_suspicious']).size()
+                hourly_df = hourly_counts.reset_index()
+                hourly_df.columns = ['hour', 'is_suspicious', 'count']
+
+                fig5 = px.line(
+                    hourly_df, x='hour', y='count', color='is_suspicious',
+                    color_discrete_map={True: 'red', False: 'blue'},
+                    title='Hourly Transaction Pattern',
+                    labels={'hour': 'Hour of Day', 'count': 'Number of Transactions', 'is_suspicious': 'Suspicious'}
+                )
+                fig5.update_layout(height=500, width=700)
+                visualizations['hourly_pattern'] = fig5
 
     except Exception as e:
         print(f"Error in visualization creation: {str(e)}")
@@ -282,19 +460,24 @@ def create_visualizations(df):
 def process_transactions(file):
     """Main function to process transaction data and detect fraud"""
     try:
-        # Load and preprocess data
-        df = load_and_preprocess_data(file)
-        if df is None:
-            return "No file uploaded", None, None, None, None, None
+        # Load and preprocess data with LLM-based analysis
+        processed_df, dataset_explanation, column_mapping = load_and_preprocess_data(file)
+
+        if processed_df is None:
+            return "No file uploaded or error in processing", None, None, None, None, None
+
+        # If column_mapping is None, the LLM analysis failed and
+        # dataset_explanation carries the error message
+        if column_mapping is None:
+            return f"Error analyzing dataset: {dataset_explanation}", None, None, None, None, None
 
-        # Detect fraud and anomalies
-        df_with_anomalies = detect_fraud_and_anomalies(df)
+        # Detect fraud and anomalies using the LLM-identified column mapping
+        df_with_anomalies = detect_fraud_and_anomalies(processed_df, column_mapping)
 
         # Get suspicious transactions
         suspicious_transactions = df_with_anomalies[df_with_anomalies['is_suspicious']]
 
-        # Create visualizations
-        visualizations = create_visualizations(df_with_anomalies)
+        # Create visualizations using the identified columns
+        visualizations = create_visualizations(df_with_anomalies, column_mapping)
 
         # Basic statistics
         total_transactions = len(df_with_anomalies)
@@ -307,14 +490,37 @@ def process_transactions(file):
 
         - **Total Transactions**: {total_transactions}
         - **Suspicious Transactions**: {suspicious_count} ({suspicious_percentage}%)
-        - **Total Transaction Value**: ${df_with_anomalies['amount'].sum():,.2f}
-        - **Suspicious Transaction Value**: ${suspicious_transactions['amount'].sum():,.2f}
-        - **Average Transaction Amount**: ${df_with_anomalies['amount'].mean():,.2f}
-        - **Average Suspicious Amount**: ${suspicious_transactions['amount'].mean():,.2f}
         """
 
+        # Add amount-related statistics if available
+        amount_col = column_mapping.get("amount_column")
+        if amount_col and amount_col in df_with_anomalies.columns:
+            stats_summary += f"""
+        - **Total Transaction Value**: ${df_with_anomalies[amount_col].sum():,.2f}
+        - **Suspicious Transaction Value**: ${suspicious_transactions[amount_col].sum():,.2f}
+        - **Average Transaction Amount**: ${df_with_anomalies[amount_col].mean():,.2f}
+        - **Average Suspicious Amount**: ${suspicious_transactions[amount_col].mean():,.2f}
+        """
+
+        # Add dataset explanation from LLM
+        stats_summary += f"""
+        ## Dataset Analysis
+
+        {dataset_explanation}
+
+        ## Detected Columns
+        """
+        for purpose, col_name in column_mapping.items():
+            if col_name and purpose not in ["column_descriptions", "fraud_indicator_columns"]:
+                stats_summary += f"- **{purpose.replace('_column', '')}**: {col_name}\n"
+
+        if column_mapping.get("fraud_indicator_columns"):
+            stats_summary += "\n**Potential Fraud Indicator Columns**:\n"
+            for col in column_mapping.get("fraud_indicator_columns", []):
+                stats_summary += f"- {col}\n"
+
         # Get AI analysis of suspicious transactions
-        ai_analysis = analyze_transaction_with_ai(df_with_anomalies, suspicious_transactions)
+        ai_analysis = analyze_transaction_with_ai(df_with_anomalies, suspicious_transactions, column_mapping)
 
         # Save suspicious transactions to a temporary file
         temp_csv = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
@@ -340,7 +546,7 @@ def create_gradio_interface():
     """Create Gradio interface for the application"""
    with gr.Blocks(title="AI Fraud Detection System") as app:
         gr.Markdown("# AI Transaction Fraud & Anomaly Detection System")
-        gr.Markdown("Upload your transaction data (CSV or Excel) to detect potential fraud and anomalies.")
+        gr.Markdown("Upload your transaction data (CSV or Excel) to detect potential fraud and anomalies. The system will use AI to analyze your dataset structure and identify relevant columns.")
 
         with gr.Row():
             file_input = gr.File(label="Upload Transaction Data", file_types=[".csv", ".xlsx", ".xls"])
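For reference, the reworked pipeline can be exercised outside Gradio roughly as follows. This is a hedged sketch: "transactions.csv" and the SimpleNamespace stand-in for Gradio's uploaded-file object are illustrative, and it assumes OPENAI_API_KEY is set so the structure analysis can run.

from types import SimpleNamespace

from app import (  # when run alongside app.py
    load_and_preprocess_data,
    detect_fraud_and_anomalies,
    create_visualizations,
)

uploaded = SimpleNamespace(name="transactions.csv")  # mimics the Gradio file object

processed_df, explanation, column_mapping = load_and_preprocess_data(uploaded)
if column_mapping is not None:
    flagged = detect_fraud_and_anomalies(processed_df, column_mapping)
    figures = create_visualizations(flagged, column_mapping)
    print(flagged["is_suspicious"].sum(), "suspicious transactions")
    print(explanation)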
 