prernajeet01 committed on
Commit
9047d3a
·
verified ·
1 Parent(s): 484ce2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -42
app.py CHANGED
@@ -125,55 +125,88 @@ def analyze_dataset_structure(df):
125
  except Exception as e:
126
  return None, f"Error analyzing dataset structure: {str(e)}"
127
 
128
- def analyze_transaction_with_ai(transaction_data, suspicious_transactions, column_mapping):
129
- """Use OpenAI to analyze suspicious transactions and provide insights"""
130
  if not openai.api_key:
131
- return "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
132
 
133
  try:
134
- # Prepare information for OpenAI, converting to a JSON-serializable format
135
- suspicious_sample = suspicious_transactions.head(5).copy()
136
-
137
- # Convert any datetime columns to string format to make it JSON serializable
138
- for col in suspicious_sample.columns:
139
- if pd.api.types.is_datetime64_any_dtype(suspicious_sample[col]):
140
- suspicious_sample[col] = suspicious_sample[col].astype(str)
141
-
142
- # Convert to dictionary
143
- suspicious_dict = suspicious_sample.to_dict(orient='records')
144
-
145
- # Get summary statistics
146
- summary_stats = {
147
- "total_transactions": int(len(transaction_data)),
148
- "flagged_transactions": int(len(suspicious_transactions)),
149
- "flagged_percentage": float(round(len(suspicious_transactions) / len(transaction_data) * 100, 2)),
150
- }
151
 
152
- # Add amount-related statistics if available
153
- amount_col = column_mapping.get("amount_column")
154
- if amount_col and amount_col in transaction_data.columns:
155
- summary_stats.update({
156
- "avg_transaction_amount": float(round(transaction_data[amount_col].mean(), 2)),
157
- "suspicious_avg_amount": float(round(suspicious_transactions[amount_col].mean(), 2))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  })
159
 
160
  # Create prompt for OpenAI
161
  prompt = f"""
162
- Analyze these potentially fraudulent transactions and identify patterns or anomalies:
 
 
 
 
163
 
164
- Transaction Data Summary:
165
- {json.dumps(summary_stats)}
 
 
 
166
 
167
- Column Mapping:
168
- {json.dumps(column_mapping)}
169
 
170
- Sample of Suspicious Transactions:
171
- {json.dumps(suspicious_dict)}
 
 
 
 
172
 
173
- Provide a concise fraud analysis report with:
174
- 1. Key patterns and red flags in these transactions
175
- 2. Possible fraud scenarios explaining the anomalies
176
- 3. Recommended next steps for investigation
 
 
 
 
 
 
 
 
 
 
177
  """
178
 
179
  # Create an OpenAI client with the API key
@@ -183,17 +216,45 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions, colum
183
  response = client.chat.completions.create(
184
  model="gpt-3.5-turbo",
185
  messages=[
186
- {"role": "system", "content": "You are a fraud detection expert helping analyze suspicious financial transactions."},
187
  {"role": "user", "content": prompt}
188
  ],
189
- max_tokens=800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  )
191
 
192
- # Return the AI analysis
193
- return response.choices[0].message.content
 
194
 
195
  except Exception as e:
196
- return f"Error in AI analysis: {str(e)}"
 
 
197
 
198
  def load_and_preprocess_data(file):
199
  """Load and preprocess transaction data from CSV or Excel file"""
 
125
  except Exception as e:
126
  return None, f"Error analyzing dataset structure: {str(e)}"
127
 
128
+ def analyze_dataset_structure(df):
129
+ """Use OpenAI to analyze the dataset structure and identify relevant columns"""
130
  if not openai.api_key:
131
+ return None, "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
132
 
133
  try:
134
+ # Get basic dataset info
135
+ sample_data = df.head(3).copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
+ # Convert any non-serializable data types to strings
138
+ for col in sample_data.columns:
139
+ if pd.api.types.is_datetime64_any_dtype(sample_data[col]):
140
+ sample_data[col] = sample_data[col].astype(str)
141
+ elif isinstance(sample_data[col].iloc[0], (np.int64, np.float64)):
142
+ sample_data[col] = sample_data[col].astype(float)
143
+
144
+ # Now convert to dict
145
+ sample_data_dict = sample_data.to_dict(orient='records')
146
+
147
+ column_info = []
148
+
149
+ for col in df.columns:
150
+ dtype = str(df[col].dtype)
151
+ unique_values = len(df[col].unique())
152
+ null_percentage = round((df[col].isna().sum() / len(df)) * 100, 2)
153
+
154
+ # Handle sample values more carefully
155
+ try:
156
+ sample_values = df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist()
157
+ # Convert numpy types to native Python types
158
+ if isinstance(sample_values, list):
159
+ sample_values = [item.item() if hasattr(item, 'item') else str(item) for item in sample_values]
160
+ sample_values_str = str(sample_values)[:100] # Limit sample length
161
+ except:
162
+ sample_values_str = "Error getting sample values"
163
+
164
+ column_info.append({
165
+ "column_name": col,
166
+ "data_type": dtype,
167
+ "unique_values_count": unique_values,
168
+ "null_percentage": null_percentage,
169
+ "sample_values": sample_values_str
170
  })
171
 
172
  # Create prompt for OpenAI
173
  prompt = f"""
174
+ Analyze this transaction dataset structure to identify the purpose of each column.
175
+
176
+ Dataset Information:
177
+ - Number of rows: {len(df)}
178
+ - Number of columns: {len(df.columns)}
179
 
180
+ Column Information:
181
+ {json.dumps(column_info, indent=2)}
182
+
183
+ Sample Data:
184
+ {json.dumps(sample_data_dict, indent=2)}
185
 
186
+ For each column in the dataset, identify its likely purpose in a transaction dataset.
187
+ Specifically identify:
188
 
189
+ 1. Which column is likely the transaction ID or reference number
190
+ 2. Which column represents the transaction amount or value
191
+ 3. Which column represents the timestamp or date of the transaction
192
+ 4. Which column represents the user ID, account ID, or customer identifier
193
+ 5. Which column might represent location information
194
+ 6. Which columns might be useful for fraud detection (e.g., IP address, device info, transaction status)
195
 
196
+ Return your analysis as a JSON object with this structure:
197
+ {{
198
+ "id_column": "column_name",
199
+ "amount_column": "column_name",
200
+ "timestamp_column": "column_name",
201
+ "user_column": "column_name",
202
+ "location_column": "column_name",
203
+ "fraud_indicator_columns": ["column1", "column2"],
204
+ "column_descriptions": {{
205
+ "column_name": "description of purpose"
206
+ }}
207
+ }}
208
+
209
+ Include only columns that you're reasonably confident about, and use null for any category where you can't identify a matching column.
210
  """
211
 
212
  # Create an OpenAI client with the API key
 
216
  response = client.chat.completions.create(
217
  model="gpt-3.5-turbo",
218
  messages=[
219
+ {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
220
  {"role": "user", "content": prompt}
221
  ],
222
+ max_tokens=1000,
223
+ response_format={"type": "json_object"}
224
+ )
225
+
226
+ # Parse the JSON response
227
+ structure_analysis = json.loads(response.choices[0].message.content)
228
+
229
+ # Also get a natural language explanation
230
+ explanation_prompt = f"""
231
+ Based on your analysis of the dataset structure, provide a brief natural language explanation of:
232
+ 1. What kind of transactions this dataset appears to contain
233
+ 2. What the key columns are and what they represent
234
+ 3. What approach would be best for detecting anomalies or fraud in this specific dataset
235
+
236
+ Keep your explanation concise and focused on the unique characteristics of this dataset.
237
+ """
238
+
239
+ explanation_response = client.chat.completions.create(
240
+ model="gpt-3.5-turbo",
241
+ messages=[
242
+ {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
243
+ {"role": "user", "content": prompt},
244
+ {"role": "assistant", "content": response.choices[0].message.content},
245
+ {"role": "user", "content": explanation_prompt}
246
+ ],
247
+ max_tokens=500
248
  )
249
 
250
+ explanation = explanation_response.choices[0].message.content
251
+
252
+ return structure_analysis, explanation
253
 
254
  except Exception as e:
255
+ import traceback
256
+ error_trace = traceback.format_exc()
257
+ return None, f"Error analyzing dataset structure: {str(e)}\n\nTrace: {error_trace}"
258
 
259
  def load_and_preprocess_data(file):
260
  """Load and preprocess transaction data from CSV or Excel file"""