itsalissonsilva committed on
Commit
9dd1f1b
·
verified Β·
1 Parent(s): c4559e8

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +54 -58
src/streamlit_app.py CHANGED
@@ -8,9 +8,8 @@ os.environ["STREAMLIT_HOME"] = "/tmp"
8
  import streamlit as st
9
  import pandas as pd
10
  import json
11
- from openai import OpenAI
12
  from sklearn.ensemble import IsolationForest
13
- from sklearn.preprocessing import LabelEncoder
14
 
15
  # Initialize OpenAI client
16
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
@@ -52,13 +51,12 @@ def query_openai(prompt: str) -> dict:
52
  max_tokens=2048
53
  )
54
  raw_output = response.choices[0].message.content
55
- print("πŸ”΅ RAW OUTPUT:\n", raw_output)
56
 
57
  json_start = raw_output.find("{")
58
  json_end = raw_output.rfind("}")
59
  if json_start != -1 and json_end != -1:
60
- json_str = raw_output[json_start:json_end + 1]
61
- return json.loads(json_str)
62
 
63
  return {"error": "Could not locate JSON structure in LLM response."}
64
  except json.JSONDecodeError as e:
@@ -66,83 +64,81 @@ def query_openai(prompt: str) -> dict:
66
  except Exception as e:
67
  return {"error": str(e)}
68
 
69
def apply_isolation_forest(df):
    """Flag anomalous rows of *df* with an IsolationForest.

    Categorical columns are label-encoded so the forest can consume them.

    BUG FIX: rows with NaN are now dropped BEFORE encoding. Previously the
    encode step ran first, and ``astype(str)`` converted NaN into the literal
    string ``"nan"`` which got encoded as an ordinary category — the later
    ``dropna()`` therefore only removed rows with NaN in numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; may contain object/category columns.

    Returns
    -------
    pandas.DataFrame or None
        Copy of the complete (NaN-free) rows of *df* with two extra columns:
        ``IForest_Score`` (decision-function score) and ``Anomaly``
        ("Yes"/"No"). None on failure; the error is shown via ``st.error``.
    """
    # Drop incomplete rows first so NaNs in object columns are removed too.
    df_clean = df.dropna()
    if df_clean.empty:
        # Fitting on an empty frame would raise; report instead of crashing.
        st.error("Isolation Forest failed: no complete rows to analyze.")
        return None

    df_encoded = df_clean.copy()
    for col in df_encoded.select_dtypes(include=["object", "category"]).columns:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))

    try:
        model = IsolationForest(contamination=0.05, random_state=42)
        preds = model.fit_predict(df_encoded)
        scores = model.decision_function(df_encoded)

        result_df = df_clean.copy()
        result_df["IForest_Score"] = scores
        # fit_predict returns -1 for outliers, 1 for inliers.
        result_df["Anomaly"] = ["Yes" if p == -1 else "No" for p in preds]
        return result_df
    except Exception as e:
        # Surface the failure in the UI rather than taking the app down.
        st.error(f"Isolation Forest failed: {e}")
        return None
87
-
88
- # ---------------- Streamlit UI ----------------
89
  st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
90
  st.title("🧠 LLM-Assisted + πŸ›‘οΈ Isolation Forest Anomaly Detector")
91
 
92
- use_sample = st.checkbox("Use built-in sample dataset (df_crypto.csv)?", value=False)
93
- df = None
 
 
94
 
95
- if use_sample:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  sample_path = "src/df_crypto.csv"
97
  try:
98
  df = pd.read_csv(sample_path)
99
  st.success("Sample dataset loaded from `src/df_crypto.csv`.")
100
  except Exception as e:
101
  st.error(f"Could not load sample dataset: {e}")
102
- else:
103
- uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
104
- if uploaded_file:
105
- try:
106
- df = pd.read_csv(uploaded_file)
107
- except Exception as e:
108
- st.error(f"Could not read uploaded CSV. Error: {e}")
109
 
 
110
  if df is not None:
111
- st.subheader("Full Dataset")
112
  st.dataframe(df, use_container_width=True)
113
 
114
- # ---------------- Isolation Forest ----------------
115
- st.markdown("### πŸ›‘οΈ Anomaly Detection with Isolation Forest (whole dataset)")
116
- iforest_df = apply_isolation_forest(df)
117
-
118
- if iforest_df is not None:
119
- st.success("Isolation Forest analysis completed.")
120
- st.dataframe(iforest_df[iforest_df["Anomaly"] == "Yes"], use_container_width=True)
121
-
122
- # ---------------- LLM Section ----------------
123
- st.markdown("### πŸ” LLM-Based Anomaly Detection (specific column)")
124
-
125
- selected_column = st.selectbox("Select a column to analyze with LLM:", df.columns)
126
-
127
- if st.button("Run LLM Anomaly Detection on selected column"):
128
- with st.spinner("Analyzing column with LLM..."):
129
- values = df[selected_column].dropna().tolist()
130
- values = values[:500] # keep within token limits
131
-
 
 
 
132
  value_list_with_index = [
133
  {"index": idx, "value": str(val)} for idx, val in enumerate(values)
134
  ]
135
 
136
  prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
137
-
138
  result = query_openai(prompt)
139
 
140
  if "anomalies" in result:
141
- st.success(f"LLM found {len(result['anomalies'])} anomalies in `{selected_column}`.")
142
  st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
143
  else:
144
- st.warning("No anomalies found or invalid response from LLM.")
145
  st.subheader("Raw Model Output")
146
  st.json(result)
147
  else:
148
- st.info("Please upload a CSV or use the sample dataset.")
 
8
  import streamlit as st
9
  import pandas as pd
10
  import json
 
11
  from sklearn.ensemble import IsolationForest
12
+ from openai import OpenAI
13
 
14
  # Initialize OpenAI client
15
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
51
  max_tokens=2048
52
  )
53
  raw_output = response.choices[0].message.content
54
+ print("\nπŸ”΅ RAW OUTPUT:\n", raw_output)
55
 
56
  json_start = raw_output.find("{")
57
  json_end = raw_output.rfind("}")
58
  if json_start != -1 and json_end != -1:
59
+ return json.loads(raw_output[json_start:json_end+1])
 
60
 
61
  return {"error": "Could not locate JSON structure in LLM response."}
62
  except json.JSONDecodeError as e:
 
64
  except Exception as e:
65
  return {"error": str(e)}
66
 
67
# ---------------- UI HEADER ----------------
st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
st.title("🧠 LLM-Assisted + 🛡️ Isolation Forest Anomaly Detector")

st.markdown("""
Welcome! This app combines two anomaly detection approaches:
- 🛡️ **Isolation Forest** to flag numeric and structural outliers across the whole dataset
- 🤖 **LLM Analysis** to detect unusual values in a **single column** (like odd formats or rare entries)

Get started by uploading your own dataset or trying our sample one.
""")

# ---------------- DATA SELECTION ----------------
df = None

# BUG FIX: the previous version gated the uploader behind st.button(), but a
# button is only True on the single rerun immediately after it is clicked.
# Choosing a file in st.file_uploader triggers a new rerun, on which
# use_uploaded was False again — the uploader vanished and df could never be
# loaded from an upload. st.radio keeps its selection across reruns.
source = st.radio(
    "Choose a data source:",
    ("📁 Upload your own file", "📊 Use sample dataset"),
)

if source == "📁 Upload your own file":
    uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
    if uploaded_file:
        try:
            df = pd.read_csv(uploaded_file)
            st.success("File uploaded successfully.")
        except Exception as e:
            st.error(f"Could not read uploaded CSV. Error: {e}")
else:
    # Bundled demo dataset, path relative to the app's working directory.
    sample_path = "src/df_crypto.csv"
    try:
        df = pd.read_csv(sample_path)
        st.success("Sample dataset loaded from `src/df_crypto.csv`.")
    except Exception as e:
        st.error(f"Could not load sample dataset: {e}")
 
 
 
 
 
 
 
102
 
103
# ---------------- MAIN ANALYSIS ----------------
if df is not None:
    st.subheader("🔍 Dataset Preview")
    st.dataframe(df, use_container_width=True)

    # --- Isolation Forest (numeric columns only) ---
    st.subheader("🛡️ Isolation Forest Results")
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
    if len(numeric_cols) == 0:
        st.warning("No numeric columns found for Isolation Forest.")
    else:
        df_numeric = df[numeric_cols].dropna()
        if df_numeric.empty:
            # BUG FIX: fitting on an empty frame (every numeric row had a
            # NaN) previously raised an uncaught ValueError and crashed the
            # page; report it instead.
            st.warning("No complete numeric rows available for Isolation Forest.")
        else:
            iso_forest = IsolationForest(contamination=0.05, random_state=42)
            iso_preds = iso_forest.fit_predict(df_numeric)  # -1 marks outliers
            # Map flagged rows back to the ORIGINAL frame so the table shows
            # every column, not just the numeric features used for scoring.
            anomalies_df = df.loc[df_numeric.index[iso_preds == -1]]

            st.write(f"Found {len(anomalies_df)} anomalies based on numerical features.")
            st.dataframe(anomalies_df, use_container_width=True)

    # --- LLM-Based Single Column Analysis ---
    st.subheader("🤖 LLM-Based Single Column Analysis")
    selected_column = st.selectbox("Select a column to analyze for anomalies:", df.columns)

    if st.button("Run LLM Anomaly Detection"):
        with st.spinner("Analyzing with LLM..."):
            # Cap at 500 values to stay inside the model's token budget.
            values = df[selected_column].dropna().tolist()[:500]
            value_list_with_index = [
                {"index": idx, "value": str(val)} for idx, val in enumerate(values)
            ]

            prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
            result = query_openai(prompt)

            if "anomalies" in result:
                st.success(f"Found {len(result['anomalies'])} anomalies in column `{selected_column}`.")
                st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
            else:
                # Either the model found nothing or its reply wasn't valid
                # JSON — show the raw payload so the user can inspect it.
                st.warning("No anomalies found or the model response was invalid.")
                st.subheader("Raw Model Output")
                st.json(result)
else:
    st.info("Please upload a file or use the sample dataset to begin.")