Spaces:

prernajeet01
/

fraud_detection

Runtime error

App Files Files Community

prernajeet01 committed on Feb 26, 2025

Commit

9047d3a

verified ·

1 Parent(s): 484ce2d

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -42

app.py CHANGED Viewed

@@ -125,55 +125,88 @@ def analyze_dataset_structure(df):
     except Exception as e:
         return None, f"Error analyzing dataset structure: {str(e)}"
-def analyze_transaction_with_ai(transaction_data, suspicious_transactions, column_mapping):
-    """Use OpenAI to analyze suspicious transactions and provide insights"""
     if not openai.api_key:
-        return "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
     try:
-        # Prepare information for OpenAI, converting to a JSON-serializable format
-        suspicious_sample = suspicious_transactions.head(5).copy()
-        # Convert any datetime columns to string format to make it JSON serializable
-        for col in suspicious_sample.columns:
-            if pd.api.types.is_datetime64_any_dtype(suspicious_sample[col]):
-                suspicious_sample[col] = suspicious_sample[col].astype(str)
-        # Convert to dictionary
-        suspicious_dict = suspicious_sample.to_dict(orient='records')
-        # Get summary statistics
-        summary_stats = {
-            "total_transactions": int(len(transaction_data)),
-            "flagged_transactions": int(len(suspicious_transactions)),
-            "flagged_percentage": float(round(len(suspicious_transactions) / len(transaction_data) * 100, 2)),
-        }
-        # Add amount-related statistics if available
-        amount_col = column_mapping.get("amount_column")
-        if amount_col and amount_col in transaction_data.columns:
-            summary_stats.update({
-                "avg_transaction_amount": float(round(transaction_data[amount_col].mean(), 2)),
-                "suspicious_avg_amount": float(round(suspicious_transactions[amount_col].mean(), 2))
             })
         # Create prompt for OpenAI
         prompt = f"""
-        Analyze these potentially fraudulent transactions and identify patterns or anomalies:
-        Transaction Data Summary:
-        {json.dumps(summary_stats)}
-        Column Mapping:
-        {json.dumps(column_mapping)}
-        Sample of Suspicious Transactions:
-        {json.dumps(suspicious_dict)}
-        Provide a concise fraud analysis report with:
-        1. Key patterns and red flags in these transactions
-        2. Possible fraud scenarios explaining the anomalies
-        3. Recommended next steps for investigation
         """
         # Create an OpenAI client with the API key
@@ -183,17 +216,45 @@ def analyze_transaction_with_ai(transaction_data, suspicious_transactions, colum
         response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[
-                {"role": "system", "content": "You are a fraud detection expert helping analyze suspicious financial transactions."},
                 {"role": "user", "content": prompt}
             ],
-            max_tokens=800
         )
-        # Return the AI analysis
-        return response.choices[0].message.content
     except Exception as e:
-        return f"Error in AI analysis: {str(e)}"
 def load_and_preprocess_data(file):
     """Load and preprocess transaction data from CSV or Excel file"""

     except Exception as e:
         return None, f"Error analyzing dataset structure: {str(e)}"
+def analyze_dataset_structure(df):
+    """Use OpenAI to analyze the dataset structure and identify relevant columns"""
     if not openai.api_key:
+        return None, "OpenAI API key not found. Please add it to the Hugging Face Spaces secrets."
     try:
+        # Get basic dataset info
+        sample_data = df.head(3).copy()
+        # Convert any non-serializable data types to strings
+        for col in sample_data.columns:
+            if pd.api.types.is_datetime64_any_dtype(sample_data[col]):
+                sample_data[col] = sample_data[col].astype(str)
+            elif isinstance(sample_data[col].iloc[0], (np.int64, np.float64)):
+                sample_data[col] = sample_data[col].astype(float)
+        # Now convert to dict
+        sample_data_dict = sample_data.to_dict(orient='records')
+        column_info = []
+        for col in df.columns:
+            dtype = str(df[col].dtype)
+            unique_values = len(df[col].unique())
+            null_percentage = round((df[col].isna().sum() / len(df)) * 100, 2)
+            # Handle sample values more carefully
+            try:
+                sample_values = df[col].dropna().sample(min(3, len(df[col].dropna()))).tolist()
+                # Convert numpy types to native Python types
+                if isinstance(sample_values, list):
+                    sample_values = [item.item() if hasattr(item, 'item') else str(item) for item in sample_values]
+                sample_values_str = str(sample_values)[:100]  # Limit sample length
+            except:
+                sample_values_str = "Error getting sample values"
+            column_info.append({
+                "column_name": col,
+                "data_type": dtype,
+                "unique_values_count": unique_values,
+                "null_percentage": null_percentage,
+                "sample_values": sample_values_str
             })
         # Create prompt for OpenAI
         prompt = f"""
+        Analyze this transaction dataset structure to identify the purpose of each column.
+        Dataset Information:
+        - Number of rows: {len(df)}
+        - Number of columns: {len(df.columns)}
+        Column Information:
+        {json.dumps(column_info, indent=2)}
+        Sample Data:
+        {json.dumps(sample_data_dict, indent=2)}
+        For each column in the dataset, identify its likely purpose in a transaction dataset.
+        Specifically identify:
+        1. Which column is likely the transaction ID or reference number
+        2. Which column represents the transaction amount or value
+        3. Which column represents the timestamp or date of the transaction
+        4. Which column represents the user ID, account ID, or customer identifier
+        5. Which column might represent location information
+        6. Which columns might be useful for fraud detection (e.g., IP address, device info, transaction status)
+        Return your analysis as a JSON object with this structure:
+        {{
+            "id_column": "column_name",
+            "amount_column": "column_name",
+            "timestamp_column": "column_name",
+            "user_column": "column_name",
+            "location_column": "column_name",
+            "fraud_indicator_columns": ["column1", "column2"],
+            "column_descriptions": {{
+                "column_name": "description of purpose"
+            }}
+        }}
+        Include only columns that you're reasonably confident about, and use null for any category where you can't identify a matching column.
         """
         # Create an OpenAI client with the API key
         response = client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=[
+                {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
                 {"role": "user", "content": prompt}
             ],
+            max_tokens=1000,
+            response_format={"type": "json_object"}
+        )
+        # Parse the JSON response
+        structure_analysis = json.loads(response.choices[0].message.content)
+        # Also get a natural language explanation
+        explanation_prompt = f"""
+        Based on your analysis of the dataset structure, provide a brief natural language explanation of:
+        1. What kind of transactions this dataset appears to contain
+        2. What the key columns are and what they represent
+        3. What approach would be best for detecting anomalies or fraud in this specific dataset
+        Keep your explanation concise and focused on the unique characteristics of this dataset.
+        """
+        explanation_response = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": "You are a data analysis expert specializing in financial transaction data structures."},
+                {"role": "user", "content": prompt},
+                {"role": "assistant", "content": response.choices[0].message.content},
+                {"role": "user", "content": explanation_prompt}
+            ],
+            max_tokens=500
         )
+        explanation = explanation_response.choices[0].message.content
+        return structure_analysis, explanation
     except Exception as e:
+        import traceback
+        error_trace = traceback.format_exc()
+        return None, f"Error analyzing dataset structure: {str(e)}\n\nTrace: {error_trace}"
 def load_and_preprocess_data(file):
     """Load and preprocess transaction data from CSV or Excel file"""