itsalissonsilva commited on
Commit
b0330bc
·
verified ·
1 Parent(s): 9dd1f1b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +44 -60
src/streamlit_app.py CHANGED
@@ -8,8 +8,8 @@ os.environ["STREAMLIT_HOME"] = "/tmp"
8
  import streamlit as st
9
  import pandas as pd
10
  import json
11
- from sklearn.ensemble import IsolationForest
12
  from openai import OpenAI
 
13
 
14
  # Initialize OpenAI client
15
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@@ -51,94 +51,78 @@ def query_openai(prompt: str) -> dict:
51
  max_tokens=2048
52
  )
53
  raw_output = response.choices[0].message.content
54
- print("\n🔵 RAW OUTPUT:\n", raw_output)
55
 
56
  json_start = raw_output.find("{")
57
  json_end = raw_output.rfind("}")
58
  if json_start != -1 and json_end != -1:
59
- return json.loads(raw_output[json_start:json_end+1])
 
60
 
61
  return {"error": "Could not locate JSON structure in LLM response."}
62
  except json.JSONDecodeError as e:
63
- return {"error": f"Failed to parse JSON: {str(e)}"}
64
  except Exception as e:
65
  return {"error": str(e)}
66
 
67
- # ---------------- UI HEADER ----------------
68
- st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
69
- st.title("🧠 LLM-Assisted + 🛡️ Isolation Forest Anomaly Detector")
70
 
 
71
  st.markdown("""
72
- Welcome! This app combines two anomaly detection approaches:
73
- - 🛡️ **Isolation Forest** to flag numeric and structural outliers across the whole dataset
74
- - 🤖 **LLM Analysis** to detect unusual values in a **single column** (like odd formats or rare entries)
75
-
76
- Get started by uploading your own dataset or trying our sample one.
77
  """)
78
 
79
- # ---------------- DATA SELECTION ----------------
 
 
 
80
  df = None
81
- col1, col2 = st.columns(2)
82
- with col1:
83
- use_uploaded = st.button("📁 Upload your own file")
84
- with col2:
85
- use_sample = st.button("📊 Use sample dataset")
86
-
87
- if use_uploaded:
88
- uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
89
- if uploaded_file:
90
- try:
91
- df = pd.read_csv(uploaded_file)
92
- st.success("File uploaded successfully.")
93
- except Exception as e:
94
- st.error(f"Could not read uploaded CSV. Error: {e}")
95
- elif use_sample:
96
- sample_path = "src/df_crypto.csv"
97
- try:
98
- df = pd.read_csv(sample_path)
99
- st.success("Sample dataset loaded from `src/df_crypto.csv`.")
100
- except Exception as e:
101
- st.error(f"Could not load sample dataset: {e}")
102
 
103
- # ---------------- MAIN ANALYSIS ----------------
104
  if df is not None:
105
- st.subheader("🔍 Dataset Preview")
106
  st.dataframe(df, use_container_width=True)
107
 
108
- # --- Isolation Forest ---
109
- st.subheader("🛡️ Isolation Forest Results")
110
- numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
111
- if len(numeric_cols) > 0:
112
- iso_forest = IsolationForest(contamination=0.05, random_state=42)
113
- df_numeric = df[numeric_cols].dropna()
114
- iso_preds = iso_forest.fit_predict(df_numeric)
115
- anomalies_df = df_numeric[iso_preds == -1]
116
-
117
- st.write(f"Found {len(anomalies_df)} anomalies based on numerical features.")
118
- st.dataframe(anomalies_df, use_container_width=True)
119
- else:
120
- st.warning("No numeric columns found for Isolation Forest.")
 
 
 
 
121
 
122
- # --- LLM-Based Single Column Analysis ---
123
- st.subheader("🤖 LLM-Based Single Column Analysis")
124
- selected_column = st.selectbox("Select a column to analyze for anomalies:", df.columns)
125
 
126
  if st.button("Run LLM Anomaly Detection"):
127
- with st.spinner("Analyzing with LLM..."):
128
- values = df[selected_column].dropna().tolist()[:500] # Trim to token budget
129
- value_list_with_index = [
130
- {"index": idx, "value": str(val)} for idx, val in enumerate(values)
131
- ]
132
 
133
  prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
134
  result = query_openai(prompt)
135
 
136
  if "anomalies" in result:
137
- st.success(f"Found {len(result['anomalies'])} anomalies in column `{selected_column}`.")
138
  st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
139
  else:
140
- st.warning("No anomalies found or the model response was invalid.")
141
  st.subheader("Raw Model Output")
142
  st.json(result)
143
  else:
144
- st.info("Please upload a file or use the sample dataset to begin.")
 
8
  import streamlit as st
9
  import pandas as pd
10
  import json
 
11
  from openai import OpenAI
12
+ from sklearn.ensemble import IsolationForest
13
 
14
  # Initialize OpenAI client
15
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
51
  max_tokens=2048
52
  )
53
  raw_output = response.choices[0].message.content
54
+ print("🔵 RAW OUTPUT:\n", raw_output)
55
 
56
  json_start = raw_output.find("{")
57
  json_end = raw_output.rfind("}")
58
  if json_start != -1 and json_end != -1:
59
+ json_str = raw_output[json_start:json_end + 1]
60
+ return json.loads(json_str)
61
 
62
  return {"error": "Could not locate JSON structure in LLM response."}
63
  except json.JSONDecodeError as e:
64
+ return {"error": f"Failed to parse JSON: {str(e)}"}
65
  except Exception as e:
66
  return {"error": str(e)}
67
 
68
+ st.set_page_config(page_title="LLM Financial Anomaly Detector", layout="wide")
 
 
69
 
70
+ st.title("LLM-Assisted Financial Anomaly Detector")
71
  st.markdown("""
72
+ This app helps you detect unusual transactions in financial datasets. First, it applies an **Isolation Forest** model to highlight statistical anomalies.
73
+ Then, you can choose a specific column and let an **LLM** (Large Language Model) inspect the values and report unusual entries based on format, rarity, or inconsistency.
 
 
 
74
  """)
75
 
76
+ use_sample = st.button("Use Sample Dataset")
77
+
78
+ uploaded_file = st.file_uploader("Or upload your own CSV file", type=["csv"])
79
+
80
  df = None
81
+ if use_sample:
82
+ df = pd.read_csv("richer_500_crypto.csv")
83
+ elif uploaded_file:
84
+ df = pd.read_csv(uploaded_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
 
86
  if df is not None:
87
+ st.subheader("Dataset Preview")
88
  st.dataframe(df, use_container_width=True)
89
 
90
+ # Isolation Forest - simple numeric anomaly detection
91
+ st.subheader("Anomalies Detected with Isolation Forest")
92
+ try:
93
+ numeric_df = df.select_dtypes(include=["float64", "int64"]).dropna()
94
+ if not numeric_df.empty:
95
+ iso_model = IsolationForest(contamination=0.05, random_state=42)
96
+ preds = iso_model.fit_predict(numeric_df)
97
+ scores = iso_model.decision_function(numeric_df)
98
+
99
+ numeric_df["anomaly"] = preds
100
+ numeric_df["score"] = scores
101
+ st.write("Isolation Forest applied to numeric columns only:")
102
+ st.dataframe(numeric_df[numeric_df["anomaly"] == -1], use_container_width=True)
103
+ else:
104
+ st.warning("No numeric columns found to run Isolation Forest.")
105
+ except Exception as e:
106
+ st.error(f"Error running Isolation Forest: {e}")
107
 
108
+ # LLM-based anomaly detection
109
+ st.subheader("LLM-Based Column Anomaly Detection")
110
+ selected_column = st.selectbox("Select a column to analyze:", df.columns)
111
 
112
  if st.button("Run LLM Anomaly Detection"):
113
+ with st.spinner("Analyzing column with LLM..."):
114
+ values = df[selected_column].dropna().tolist()[:500]
115
+ value_list_with_index = [{"index": idx, "value": str(val)} for idx, val in enumerate(values)]
 
 
116
 
117
  prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
118
  result = query_openai(prompt)
119
 
120
  if "anomalies" in result:
121
+ st.success(f"Found {len(result['anomalies'])} anomalies in `{selected_column}`.")
122
  st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
123
  else:
124
+ st.warning("No anomalies found or LLM response was invalid.")
125
  st.subheader("Raw Model Output")
126
  st.json(result)
127
  else:
128
+ st.info("Please load a dataset to begin.")