Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +44 -60
src/streamlit_app.py
CHANGED
|
@@ -8,8 +8,8 @@ os.environ["STREAMLIT_HOME"] = "/tmp"
|
|
| 8 |
import streamlit as st
|
| 9 |
import pandas as pd
|
| 10 |
import json
|
| 11 |
-
from sklearn.ensemble import IsolationForest
|
| 12 |
from openai import OpenAI
|
|
|
|
| 13 |
|
| 14 |
# Initialize OpenAI client
|
| 15 |
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
@@ -51,94 +51,78 @@ def query_openai(prompt: str) -> dict:
|
|
| 51 |
max_tokens=2048
|
| 52 |
)
|
| 53 |
raw_output = response.choices[0].message.content
|
| 54 |
-
print("
|
| 55 |
|
| 56 |
json_start = raw_output.find("{")
|
| 57 |
json_end = raw_output.rfind("}")
|
| 58 |
if json_start != -1 and json_end != -1:
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
return {"error": "Could not locate JSON structure in LLM response."}
|
| 62 |
except json.JSONDecodeError as e:
|
| 63 |
-
return {"error": f"Failed to parse JSON: {str(e)}"}
|
| 64 |
except Exception as e:
|
| 65 |
return {"error": str(e)}
|
| 66 |
|
| 67 |
-
|
| 68 |
-
st.set_page_config(page_title="LLM-Assisted Anomaly Detector", layout="wide")
|
| 69 |
-
st.title("π§ LLM-Assisted + π‘οΈ Isolation Forest Anomaly Detector")
|
| 70 |
|
|
|
|
| 71 |
st.markdown("""
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
- π€ **LLM Analysis** to detect unusual values in a **single column** (like odd formats or rare entries)
|
| 75 |
-
|
| 76 |
-
Get started by uploading your own dataset or trying our sample one.
|
| 77 |
""")
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
| 80 |
df = None
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
use_sample = st.button("π Use sample dataset")
|
| 86 |
-
|
| 87 |
-
if use_uploaded:
|
| 88 |
-
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
|
| 89 |
-
if uploaded_file:
|
| 90 |
-
try:
|
| 91 |
-
df = pd.read_csv(uploaded_file)
|
| 92 |
-
st.success("File uploaded successfully.")
|
| 93 |
-
except Exception as e:
|
| 94 |
-
st.error(f"Could not read uploaded CSV. Error: {e}")
|
| 95 |
-
elif use_sample:
|
| 96 |
-
sample_path = "src/df_crypto.csv"
|
| 97 |
-
try:
|
| 98 |
-
df = pd.read_csv(sample_path)
|
| 99 |
-
st.success("Sample dataset loaded from `src/df_crypto.csv`.")
|
| 100 |
-
except Exception as e:
|
| 101 |
-
st.error(f"Could not load sample dataset: {e}")
|
| 102 |
|
| 103 |
-
# ---------------- MAIN ANALYSIS ----------------
|
| 104 |
if df is not None:
|
| 105 |
-
st.subheader("
|
| 106 |
st.dataframe(df, use_container_width=True)
|
| 107 |
|
| 108 |
-
#
|
| 109 |
-
st.subheader("
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
-
#
|
| 123 |
-
st.subheader("
|
| 124 |
-
selected_column = st.selectbox("Select a column to analyze
|
| 125 |
|
| 126 |
if st.button("Run LLM Anomaly Detection"):
|
| 127 |
-
with st.spinner("Analyzing with LLM..."):
|
| 128 |
-
values = df[selected_column].dropna().tolist()[:500]
|
| 129 |
-
value_list_with_index = [
|
| 130 |
-
{"index": idx, "value": str(val)} for idx, val in enumerate(values)
|
| 131 |
-
]
|
| 132 |
|
| 133 |
prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
|
| 134 |
result = query_openai(prompt)
|
| 135 |
|
| 136 |
if "anomalies" in result:
|
| 137 |
-
st.success(f"Found {len(result['anomalies'])} anomalies in
|
| 138 |
st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
|
| 139 |
else:
|
| 140 |
-
st.warning("No anomalies found or
|
| 141 |
st.subheader("Raw Model Output")
|
| 142 |
st.json(result)
|
| 143 |
else:
|
| 144 |
-
st.info("Please
|
|
|
|
import streamlit as st
import pandas as pd
import json

from openai import OpenAI
from sklearn.ensemble import IsolationForest

# Initialize OpenAI client
# Shared OpenAI client for every LLM call; the API key is read from the
# environment (never hard-coded).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
|
|
| 51 |
max_tokens=2048
|
| 52 |
)
|
| 53 |
raw_output = response.choices[0].message.content
|
| 54 |
+
print("π΅ RAW OUTPUT:\n", raw_output)
|
| 55 |
|
| 56 |
json_start = raw_output.find("{")
|
| 57 |
json_end = raw_output.rfind("}")
|
| 58 |
if json_start != -1 and json_end != -1:
|
| 59 |
+
json_str = raw_output[json_start:json_end + 1]
|
| 60 |
+
return json.loads(json_str)
|
| 61 |
|
| 62 |
return {"error": "Could not locate JSON structure in LLM response."}
|
| 63 |
except json.JSONDecodeError as e:
|
| 64 |
+
return {"error": f"Failed to parse JSON: {str(e)}"}
|
| 65 |
except Exception as e:
|
| 66 |
return {"error": str(e)}
|
| 67 |
|
st.set_page_config(page_title="LLM Financial Anomaly Detector", layout="wide")

st.title("LLM-Assisted Financial Anomaly Detector")
st.markdown("""
This app helps you detect unusual transactions in financial datasets. First, it applies an **Isolation Forest** model to highlight statistical anomalies.
Then, you can choose a specific column and let an **LLM** (Large Language Model) inspect the values and report unusual entries based on format, rarity, or inconsistency.
""")

# Data source selection: a bundled sample dataset or a user-supplied CSV.
use_sample = st.button("Use Sample Dataset")

uploaded_file = st.file_uploader("Or upload your own CSV file", type=["csv"])

# Load the chosen dataset. A failed read (missing sample file, malformed
# upload) must not crash the app: report the error and leave df as None so
# the main section falls through to the "load a dataset" hint.
df = None
if use_sample:
    try:
        df = pd.read_csv("richer_500_crypto.csv")
    except Exception as e:
        st.error(f"Could not load sample dataset: {e}")
elif uploaded_file:
    try:
        df = pd.read_csv(uploaded_file)
    except Exception as e:
        st.error(f"Could not read uploaded CSV. Error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------- MAIN ANALYSIS ----------------
if df is not None:
    st.subheader("Dataset Preview")
    st.dataframe(df, use_container_width=True)

    # Isolation Forest - simple numeric anomaly detection
    st.subheader("Anomalies Detected with Isolation Forest")
    try:
        # include="number" covers every numeric dtype (int32, float32, ...),
        # not just the 64-bit ones; .copy() makes the frame independent of
        # df so the column assignments below cannot trigger
        # SettingWithCopyWarning.
        numeric_df = df.select_dtypes(include="number").dropna().copy()
        if not numeric_df.empty:
            iso_model = IsolationForest(contamination=0.05, random_state=42)
            preds = iso_model.fit_predict(numeric_df)
            scores = iso_model.decision_function(numeric_df)

            # fit_predict marks anomalies with -1; lower decision-function
            # scores mean "more anomalous".
            numeric_df["anomaly"] = preds
            numeric_df["score"] = scores
            st.write("Isolation Forest applied to numeric columns only:")
            st.dataframe(numeric_df[numeric_df["anomaly"] == -1], use_container_width=True)
        else:
            st.warning("No numeric columns found to run Isolation Forest.")
    except Exception as e:
        st.error(f"Error running Isolation Forest: {e}")

    # LLM-based anomaly detection
    st.subheader("LLM-Based Column Anomaly Detection")
    selected_column = st.selectbox("Select a column to analyze:", df.columns)

    if st.button("Run LLM Anomaly Detection"):
        with st.spinner("Analyzing column with LLM..."):
            # Cap at 500 values to keep the prompt within the model's
            # context window.
            values = df[selected_column].dropna().tolist()[:500]
            value_list_with_index = [{"index": idx, "value": str(val)} for idx, val in enumerate(values)]

            prompt = PROMPT_INSTRUCTIONS_TEXT + "\n\nVALUES:\n" + json.dumps(value_list_with_index, indent=2)
            result = query_openai(prompt)

            if "anomalies" in result:
                st.success(f"Found {len(result['anomalies'])} anomalies in `{selected_column}`.")
                st.dataframe(pd.json_normalize(result["anomalies"]), use_container_width=True)
            else:
                st.warning("No anomalies found or LLM response was invalid.")
                # Surface the raw payload so a bad model response can be
                # debugged from the UI.
                st.subheader("Raw Model Output")
                st.json(result)
else:
    st.info("Please load a dataset to begin.")
|