File size: 7,413 Bytes
69cbf5f
 
 
 
 
 
 
 
 
 
 
 
 
 
405dcde
69cbf5f
 
 
 
7ca1795
 
405dcde
7ca1795
 
69cbf5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ca1795
 
 
69cbf5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405dcde
 
69cbf5f
 
 
 
 
 
 
 
 
 
 
 
7ca1795
69cbf5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ca1795
69cbf5f
 
 
 
 
7ca1795
 
 
69cbf5f
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# Standard library
import base64
import io
import os

# Third-party
import matplotlib
matplotlib.use('Agg')  # select the headless backend BEFORE pyplot is imported
import matplotlib.pyplot as plt
import gradio as gr
import numpy as np
import pandas as pd
import seaborn as sns
from openai import OpenAI
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.svm import OneClassSVM

# Path to the CSV file in the environment (read on every detection run).
CSV_PATH = 'FI_Transactions.csv'

# OpenAI API key from the environment. The client is only constructed when a
# key is actually present: the OpenAI() constructor raises when no key can be
# found, which would crash the whole app at import time. get_ai_insights()
# checks OPENAI_API_KEY before touching `client`, so a None client is safe.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None

def detect_anomalies(nu_value, n_clusters):
    """Detect anomalies and cluster transactions from the CSV file.

    Parameters
    ----------
    nu_value : float
        One-Class SVM ``nu`` — an upper bound on the fraction of training
        points flagged as outliers (higher = more anomalies).
    n_clusters : int
        Number of KMeans clusters. Cast to int because Gradio sliders may
        deliver floats.

    Returns
    -------
    tuple
        (pie chart image, kmeans scatter image, svm scatter image,
        summary HTML string, anomalies-table HTML string) — matching the
        Gradio output components wired to this callback.
    """
    # Re-read on every invocation so slider changes always see fresh data.
    df = pd.read_csv(CSV_PATH)

    # Only numeric columns are usable by SVM/KMeans; scale() standardizes
    # each column to zero mean / unit variance.
    features = df.select_dtypes(include=[np.number])
    feature_names = features.columns.tolist()
    scaled_features = scale(features)

    # One-Class SVM: fit on all rows, then points predicted as -1 are
    # treated as anomalies.
    svm_model = OneClassSVM(kernel='rbf', nu=float(nu_value), gamma='scale')
    svm_model.fit(scaled_features)
    svm_preds = svm_model.predict(scaled_features)
    df['SVM_Anomaly'] = ['Anomaly' if x == -1 else 'Normal' for x in svm_preds]

    anomaly_count = (df['SVM_Anomaly'] == 'Anomaly').sum()
    normal_count = (df['SVM_Anomaly'] == 'Normal').sum()

    # KMeans clustering; n_init is pinned so results stay reproducible
    # across scikit-learn versions (the default changed to 'auto').
    kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init=10)
    kmeans.fit(scaled_features)
    df['KMeans_Cluster'] = kmeans.labels_

    # --- Visualizations -------------------------------------------------

    # 1. Pie chart of anomalous vs normal transaction counts.
    plt.figure(figsize=(8, 6))
    plt.pie([anomaly_count, normal_count], labels=['Anomalies', 'Normal'],
            autopct='%1.1f%%', colors=['#FF9999', '#66B2FF'])
    plt.title('SVM Anomaly Detection Results')
    pie_chart_img = plt_to_img()

    # Scatter plots use the first two numeric features; when only one
    # numeric column exists, plot it against itself rather than crashing.
    x_feature = 0
    y_feature = 1 if len(feature_names) > 1 else 0

    # 2. KMeans clusters in the (x_feature, y_feature) plane.
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(scaled_features[:, x_feature],
                          scaled_features[:, y_feature],
                          c=kmeans.labels_,
                          cmap='viridis',
                          alpha=0.7)
    plt.colorbar(scatter, label='Cluster')
    plt.title('KMeans Clustering Results')
    plt.xlabel(feature_names[x_feature])
    plt.ylabel(feature_names[y_feature])
    kmeans_img = plt_to_img()

    # 3. SVM anomalies. The two classes are drawn as separate scatter
    #    calls so the legend entries map to the correct colors (a single
    #    scatter with a per-point color list yields a mismatched legend).
    plt.figure(figsize=(10, 6))
    anomaly_mask = (df['SVM_Anomaly'] == 'Anomaly').to_numpy()
    plt.scatter(scaled_features[anomaly_mask, x_feature],
                scaled_features[anomaly_mask, y_feature],
                c='red', alpha=0.7, label='Anomaly')
    plt.scatter(scaled_features[~anomaly_mask, x_feature],
                scaled_features[~anomaly_mask, y_feature],
                c='blue', alpha=0.7, label='Normal')
    plt.title('SVM Anomaly Detection')
    plt.xlabel(feature_names[x_feature])
    plt.ylabel(feature_names[y_feature])
    plt.legend()
    svm_img = plt_to_img()

    # Table of flagged rows (reset_index keeps the original row number as
    # a column so analysts can trace back to the source file).
    anomalies_df = df[df['SVM_Anomaly'] == 'Anomaly'].reset_index()

    # LLM commentary on the detected anomalies (best-effort; returns an
    # explanatory message on failure).
    ai_insights = get_ai_insights(df, anomalies_df)

    anomalies_html = anomalies_df.to_html(classes='table table-striped')

    summary_html = f"""
    <h3>Analysis Summary</h3>
    <p>Total transactions: {len(df)}</p>
    <p>Anomalies detected: {anomaly_count} ({anomaly_count/len(df)*100:.2f}%)</p>
    <p>Normal transactions: {normal_count} ({normal_count/len(df)*100:.2f}%)</p>
    
    <h3>AI Insights</h3>
    <p>{ai_insights}</p>
    """

    return pie_chart_img, kmeans_img, svm_img, summary_html, anomalies_html

def get_ai_insights(df, anomalies_df):
    """Ask the OpenAI chat API for an expert reading of the anomalies.

    Parameters
    ----------
    df : pandas.DataFrame
        Full transaction table (its describe() summary is sent as context).
    anomalies_df : pandas.DataFrame
        Rows flagged as anomalies; up to five are included in the prompt.

    Returns
    -------
    str
        The model's analysis, or a human-readable fallback message when
        the API key is missing or the request fails.
    """
    # Guard clause: without a key there is nothing to call.
    if not OPENAI_API_KEY:
        return "OpenAI API key not found in environment variables. AI insights are unavailable."

    try:
        # Context for the model: dataset statistics + a small anomaly sample.
        stats_block = df.describe().to_string()
        if anomalies_df.empty:
            sample_block = "No anomalies detected"
        else:
            sample_block = anomalies_df.head(5).to_string()

        prompt = f"""
        Analyze the following financial transaction data and detected anomalies:
        
        Dataset Statistics:
        {stats_block}
        
        Sample Anomalies (top 5):
        {sample_block}
        
        Please provide:
        1. Possible patterns or reasons for these anomalies
        2. Recommendations for further investigation
        3. Potential risk factors these anomalies might indicate
        
        Keep your analysis concise and focused on financial fraud detection.
        """

        chat = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a financial fraud detection expert."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=500,
        )
        return chat.choices[0].message.content

    except Exception as e:
        # Best-effort: surface the failure as text instead of breaking the UI.
        return f"Could not generate AI insights. Error: {str(e)}"

def plt_to_img():
    """Render the current matplotlib figure to a PIL image, then close it.

    Closing the figure is essential here: this helper runs three times per
    Gradio callback, and without plt.close() every click leaks three open
    figures (matplotlib keeps them alive until explicitly closed).

    Returns
    -------
    PIL.Image.Image
        Fully-loaded PNG rendering of the current figure.
    """
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()  # release the figure now that it has been rasterized
    buf.seek(0)
    img = Image.open(buf)
    img.load()  # force a full decode so the image is independent of `buf`
    return img

# Create the Gradio interface.
# Layout note: rows/components render in the order they are declared, so the
# statement order below IS the page layout — do not reorder casually.
with gr.Blocks(title="Financial Transaction Anomaly Detection") as demo:
    gr.Markdown("# Financial Transaction Anomaly Detection")
    gr.Markdown(f"Analyzing data from {CSV_PATH}")
    
    # Top row: tuning controls on the left, run summary on the right.
    with gr.Row():
        with gr.Column():
            # nu bounds the fraction of points flagged as outliers by the SVM.
            nu_slider = gr.Slider(0.01, 0.2, value=0.05, step=0.01, label="SVM nu parameter (controls anomaly threshold)")
            cluster_slider = gr.Slider(2, 10, value=2, step=1, label="Number of KMeans clusters")
            detect_button = gr.Button("Detect Anomalies")
        
        with gr.Column():
            summary_output = gr.HTML(label="Summary")
    
    # Plot outputs: pie + SVM scatter side by side, KMeans scatter below.
    with gr.Row():
        pie_output = gr.Image(label="Anomaly Distribution")
        svm_output = gr.Image(label="SVM Anomaly Detection")
    
    with gr.Row():
        kmeans_output = gr.Image(label="KMeans Clustering")
    
    # Full table of rows flagged as anomalous.
    with gr.Row():
        anomalies_output = gr.HTML(label="Detected Anomalies")
    
    # Wire the button to the analysis function; the outputs list order must
    # match detect_anomalies' return tuple order.
    detect_button.click(
        detect_anomalies,
        inputs=[nu_slider, cluster_slider],
        outputs=[pie_output, kmeans_output, svm_output, summary_output, anomalies_output]
    )
    
    gr.Markdown("""
    ## How to Use
    1. Adjust the SVM nu parameter (controls anomaly detection sensitivity)
    2. Choose the number of clusters for KMeans
    3. Click 'Detect Anomalies' to analyze the data
    
    ## Interpretation
    - The pie chart shows the proportion of normal vs anomalous transactions
    - The scatter plots visualize the clusters and anomalies
    - The AI insights provide expert analysis of detected anomalies
    - The table displays detailed information about detected anomalies
    """)

# Launch the app when run as a script (not when imported).
if __name__ == "__main__":
    # NOTE(review): share=True also publishes a temporary public Gradio URL
    # in addition to the local server — confirm that exposure is intended.
    demo.launch(share=True)