itsalissonsilva committed on
Commit
9dd1f1b
·
verified Β·
1 Parent(s): c4559e8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +54 -58
src/streamlit_app.py CHANGED
@@ -8,9 +8,8 @@ os.environ["STREAMLIT_HOME"] = "/tmp"
8
  import streamlit as st
9
  import pandas as pd
10
  import json
11
- from openai import OpenAI
12
  from sklearn.ensemble import IsolationForest
13
- from sklearn.preprocessing import LabelEncoder
14
 
15
  # Initialize OpenAI client
16
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@@ -52,13 +51,12 @@ def query_openai(prompt: str) -> dict:
52
  max_tokens=2048
53
  )
54
  raw_output = response.choices[0].message.content
55
- print("πŸ”΅ RAW OUTPUT:\n", raw_output)
56
 
57
  json_start = raw_output.find("{")
58
  json_end = raw_output.rfind("}")
59
  if json_start != -1 and json_end != -1:
60
- json_str = raw_output[json_start:json_end + 1]
61
- return json.loads(json_str)
62
 
63
  return {"error": "Could not locate JSON structure in LLM response."}
64
  except json.JSONDecodeError as e:
@@ -66,83 +64,81 @@ def query_openai(prompt: str) -> dict:
66
  except Exception as e:
67
  return {"error": str(e)}
68
 
69
def apply_isolation_forest(df):
    """Flag anomalous rows of *df* with an IsolationForest.

    Categorical columns are label-encoded so the forest can consume them.

    BUG FIX: rows with NaN are now dropped BEFORE encoding. Previously the
    encode step ran first, and ``astype(str)`` converted NaN into the literal
    string ``"nan"`` which got encoded as an ordinary category — the later
    ``dropna()`` therefore only removed rows with NaN in numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; may contain object/category columns.

    Returns
    -------
    pandas.DataFrame or None
        Copy of the complete (NaN-free) rows of *df* with two extra columns:
        ``IForest_Score`` (decision-function score) and ``Anomaly``
        ("Yes"/"No"). None on failure; the error is shown via ``st.error``.
    """
    # Drop incomplete rows first so NaNs in object columns are removed too.
    df_clean = df.dropna()
    if df_clean.empty:
        # Fitting on an empty frame would raise; report instead of crashing.
        st.error("Isolation Forest failed: no complete rows to analyze.")
        return None

    df_encoded = df_clean.copy()
    for col in df_encoded.select_dtypes(include=["object", "category"]).columns:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))

    try:
        model = IsolationForest(contamination=0.05, random_state=42)
        preds = model.fit_predict(df_encoded)
        scores = model.decision_function(df_encoded)

        result_df = df_clean.copy()
        result_df["IForest_Score"] = scores
        # fit_predict returns -1 for outliers, 1 for inliers.
        result_df["Anomaly"] = ["Yes" if p == -1 else "No" for p in preds]
        return result_df
    except Exception as e:
        # Surface the failure in the UI rather than taking the app down.
        st.error(f"Isolation Forest failed: {e}")
        return None
87
-
88
- # ---------------- Streamlit UI ----------------
89
  st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
90
  st.title("🧠 LLM-Assisted + πŸ›‘οΈ Isolation Forest Anomaly Detector")
91
 
92
- use_sample = st.checkbox("Use built-in sample dataset (df_crypto.csv)?", value=False)
93
- df = None
 
 
94
 
95
- if use_sample:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  sample_path = "src/df_crypto.csv"
97
  try:
98
  df = pd.read_csv(sample_path)
99
  st.success("Sample dataset loaded from `src/df_crypto.csv`.")
100
  except Exception as e:
101
  st.error(f"Could not load sample dataset: {e}")
102
- else:
103
- uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
104
- if uploaded_file:
105
- try:
106
- df = pd.read_csv(uploaded_file)
107
- except Exception as e:
108
- st.error(f"Could not read uploaded CSV. Error: {e}")
109
 
 
110
  if df is not None:
111
- st.subheader("Full Dataset")
112
  st.dataframe(df, use_container_width=True)
113
 
114
- # ---------------- Isolation Forest ----------------
115
- st.markdown("### πŸ›‘οΈ Anomaly Detection with Isolation Forest (whole dataset)")
116
- iforest_df = apply_isolation_forest(df)
117
-
118
- if iforest_df is not None:
119
- st.success("Isolation Forest analysis completed.")
120
- st.dataframe(iforest_df[iforest_df["Anomaly"] == "Yes"], use_container_width=True)
121
-
122
- # ---------------- LLM Section ----------------
123
- st.markdown("### πŸ” LLM-Based Anomaly Detection (specific column)")
124
-
125
- selected_column = st.selectbox("Select a column to analyze with LLM:", df.columns)
126
-
127
- if st.button("Run LLM Anomaly Detection on selected column"):
128
- with st.spinner("Analyzing column with LLM..."):
129
- values = df[selected_column].dropna().tolist()
130
- values = values[:500] # keep within token limits
131
-
 
 
 
132
  value_list_with_index = [
133
  {"index": idx, "value": str(val)} for idx, val in enumerate(values)
134
  ]
135
 
136
  prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
137
-
138
  result = query_openai(prompt)
139
 
140
  if "anomalies" in result:
141
- st.success(f"LLM found {len(result['anomalies'])} anomalies in `{selected_column}`.")
142
  st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
143
  else:
144
- st.warning("No anomalies found or invalid response from LLM.")
145
  st.subheader("Raw Model Output")
146
  st.json(result)
147
  else:
148
- st.info("Please upload a CSV or use the sample dataset.")
 
8
  import streamlit as st
9
  import pandas as pd
10
  import json
 
11
  from sklearn.ensemble import IsolationForest
12
+ from openai import OpenAI
13
 
14
  # Initialize OpenAI client
15
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
51
  max_tokens=2048
52
  )
53
  raw_output = response.choices[0].message.content
54
+ print("\nπŸ”΅ RAW OUTPUT:\n", raw_output)
55
 
56
  json_start = raw_output.find("{")
57
  json_end = raw_output.rfind("}")
58
  if json_start != -1 and json_end != -1:
59
+ return json.loads(raw_output[json_start:json_end+1])
 
60
 
61
  return {"error": "Could not locate JSON structure in LLM response."}
62
  except json.JSONDecodeError as e:
 
64
  except Exception as e:
65
  return {"error": str(e)}
66
 
67
# ---------------- UI HEADER ----------------
st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
st.title("🧠 LLM-Assisted + 🛡️ Isolation Forest Anomaly Detector")

st.markdown("""
Welcome! This app combines two anomaly detection approaches:
- 🛡️ **Isolation Forest** to flag numeric and structural outliers across the whole dataset
- 🤖 **LLM Analysis** to detect unusual values in a **single column** (like odd formats or rare entries)

Get started by uploading your own dataset or trying our sample one.
""")

# ---------------- DATA SELECTION ----------------
df = None

# BUG FIX: the previous version gated the uploader behind st.button(), but a
# button is only True on the single rerun immediately after it is clicked.
# Choosing a file in st.file_uploader triggers a new rerun, on which
# use_uploaded was False again — the uploader vanished and df could never be
# loaded from an upload. st.radio keeps its selection across reruns.
source = st.radio(
    "Choose a data source:",
    ("📁 Upload your own file", "📊 Use sample dataset"),
)

if source == "📁 Upload your own file":
    uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
    if uploaded_file:
        try:
            df = pd.read_csv(uploaded_file)
            st.success("File uploaded successfully.")
        except Exception as e:
            st.error(f"Could not read uploaded CSV. Error: {e}")
else:
    # Bundled demo dataset, path relative to the app's working directory.
    sample_path = "src/df_crypto.csv"
    try:
        df = pd.read_csv(sample_path)
        st.success("Sample dataset loaded from `src/df_crypto.csv`.")
    except Exception as e:
        st.error(f"Could not load sample dataset: {e}")
 
 
 
 
 
 
 
102
 
103
# ---------------- MAIN ANALYSIS ----------------
if df is not None:
    st.subheader("🔍 Dataset Preview")
    st.dataframe(df, use_container_width=True)

    # --- Isolation Forest (numeric columns only) ---
    st.subheader("🛡️ Isolation Forest Results")
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
    if len(numeric_cols) == 0:
        st.warning("No numeric columns found for Isolation Forest.")
    else:
        df_numeric = df[numeric_cols].dropna()
        if df_numeric.empty:
            # BUG FIX: fitting on an empty frame (every numeric row had a
            # NaN) previously raised an uncaught ValueError and crashed the
            # page; report it instead.
            st.warning("No complete numeric rows available for Isolation Forest.")
        else:
            iso_forest = IsolationForest(contamination=0.05, random_state=42)
            iso_preds = iso_forest.fit_predict(df_numeric)  # -1 marks outliers
            # Map flagged rows back to the ORIGINAL frame so the table shows
            # every column, not just the numeric features used for scoring.
            anomalies_df = df.loc[df_numeric.index[iso_preds == -1]]

            st.write(f"Found {len(anomalies_df)} anomalies based on numerical features.")
            st.dataframe(anomalies_df, use_container_width=True)

    # --- LLM-Based Single Column Analysis ---
    st.subheader("🤖 LLM-Based Single Column Analysis")
    selected_column = st.selectbox("Select a column to analyze for anomalies:", df.columns)

    if st.button("Run LLM Anomaly Detection"):
        with st.spinner("Analyzing with LLM..."):
            # Cap at 500 values to stay inside the model's token budget.
            values = df[selected_column].dropna().tolist()[:500]
            value_list_with_index = [
                {"index": idx, "value": str(val)} for idx, val in enumerate(values)
            ]

            prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
            result = query_openai(prompt)

            if "anomalies" in result:
                st.success(f"Found {len(result['anomalies'])} anomalies in column `{selected_column}`.")
                st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
            else:
                # Either the model found nothing or its reply wasn't valid
                # JSON — show the raw payload so the user can inspect it.
                st.warning("No anomalies found or the model response was invalid.")
                st.subheader("Raw Model Output")
                st.json(result)
else:
    st.info("Please upload a file or use the sample dataset to begin.")