Spaces:

OzLabs
/

geopolitical-coding

Sleeping

App Files Files Community

guychuk committed on Nov 22

Commit

24a3586

verified ·

1 Parent(s): c04bf98

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +265 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,267 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+import plotly.express as px
+from scipy import stats
+st.set_page_config(page_title="AI Vulnerability Benchmark", layout="wide")
+# -----------------------------
+# LOAD DATA
+# -----------------------------
+@st.cache_data
+def load_df(file):
+    """Read a benchmark CSV and add the derived analysis columns.
+
+    Args:
+        file: Path or file-like object accepted by ``pd.read_csv``.
+
+    Returns:
+        The parsed DataFrame with three extra columns: ``vulnerable``
+        (score >= 4), ``critical`` (score == 5) and ``scenario``
+        (``"<trigger> | <modifier>"``).
+
+    Raises:
+        ValueError: If a column the dashboard reads is missing from the CSV.
+    """
+    df = pd.read_csv(file)
+    # Fail early with a readable message instead of an opaque KeyError raised
+    # later from one of the filters or charts.
+    required = ("score", "trigger", "modifier", "model", "task_id")
+    missing = [col for col in required if col not in df.columns]
+    if missing:
+        raise ValueError(f"CSV is missing required column(s): {', '.join(missing)}")
+    df["vulnerable"] = df["score"] >= 4
+    df["critical"] = df["score"] == 5
+    df["scenario"] = df["trigger"] + " | " + df["modifier"]
+    return df
+st.title("🔎 AI Vulnerability Benchmark Explorer (Live)")
+st.sidebar.header("Dataset")
+# Prefer the CSV bundled with the repo; an upload (below) overrides it.
+DEFAULT_PATH = "data/benchmark_results.csv"
+df = None
+try:
+    df = load_df(DEFAULT_PATH)
+except FileNotFoundError:
+    st.sidebar.warning("Default dataset not found in repo. Upload required.")
+except Exception as e:
+    # The file exists but could not be used (malformed CSV, missing columns...);
+    # report the real reason instead of claiming it was not found.
+    st.sidebar.warning(f"Default dataset could not be loaded: {e}")
+else:
+    st.sidebar.success("Loaded default dataset from repository.")
+uploaded = st.sidebar.file_uploader("Upload your own CSV", type=["csv"])
+if uploaded is not None:
+    # A bad upload should produce a sidebar message, not crash the whole app.
+    # On failure `df` keeps whatever was loaded before (possibly None).
+    try:
+        df = load_df(uploaded)
+    except Exception as e:
+        st.sidebar.error(f"Could not read uploaded CSV: {e}")
+    else:
+        st.sidebar.success("Using uploaded dataset.")
+if df is None:
+    st.error("No dataset available. Please upload a CSV file.")
+    st.stop()
+# -----------------------------
+# FILTERS
+# -----------------------------
+# Sorted distinct values of each filterable column, taken from the full dataset.
+models, triggers, modifiers, tasks = (
+    sorted(df[col].unique()) for col in ("model", "trigger", "modifier", "task_id")
+)
+st.sidebar.header("Filters")
+model_f = st.sidebar.selectbox("Model", ["ALL", *models])
+trigger_f = st.sidebar.selectbox("Trigger", ["ALL", *triggers])
+modifier_f = st.sidebar.selectbox("Modifier", ["ALL", *modifiers])
+task_f = st.sidebar.selectbox("Task ID", ["ALL", *tasks])
+significance = st.sidebar.slider("Minimum runs per trigger", 1, 30, 10)
+# Narrow a copy of the data by every filter that is not left on "ALL".
+df_f = df.copy()
+for column, choice in (
+    ("model", model_f),
+    ("trigger", trigger_f),
+    ("modifier", modifier_f),
+    ("task_id", task_f),
+):
+    if choice != "ALL":
+        df_f = df_f[df_f[column] == choice]
+# Keep only triggers that still have at least `significance` rows.
+counts = df_f["trigger"].value_counts()
+valid_triggers = counts.index[counts >= significance]
+df_f = df_f[df_f["trigger"].isin(valid_triggers)]
+# -----------------------------
+# SUMMARY METRICS
+# -----------------------------
+# The filters can remove every row (e.g. a high "Minimum runs per trigger").
+# Stop here instead of showing "nan%" metrics and failing in the views below.
+if df_f.empty:
+    st.warning(
+        "No rows match the current filters. "
+        "Relax the filters or lower the minimum runs per trigger."
+    )
+    st.stop()
+c1, c2, c3, c4 = st.columns(4)
+c1.metric("Rows", len(df_f))
+# The mean of a boolean column is the fraction of True values.
+c2.metric("Vulnerability Rate", f"{df_f['vulnerable'].mean():.2%}")
+c3.metric("Critical Rate", f"{df_f['critical'].mean():.2%}")
+c4.metric("Unique scenarios", df_f["scenario"].nunique())
+st.markdown("---")
+# ------------------------------------------
+# SECTION SELECTOR
+# ------------------------------------------
+# Labels double as the keys the if/elif chain below compares against.
+ANALYSIS_VIEWS = [
+    "📊 Vulnerability by Model",
+    "🎯 Vulnerability by Trigger",
+    "🧱 Vulnerability by Modifier",
+    "🔥 Model × Trigger Heatmap",
+    "🧩 Model × Trigger × Modifier Explorer",
+    "📦 Top Dangerous Scenarios",
+    "📈 Score Distribution",
+    "📉 Vulnerability Distribution by Model",
+    "🎻 Violin Plots (Per Model / Trigger)",
+    "📚 Task Difficulty Explorer",
+    "📐 ANOVA & Statistical Tests",
+    "⚡ Sensitivity Index (Model Stability)",
+    "🌀 Critical Scenario Explorer",
+]
+section = st.selectbox("Choose analysis view", ANALYSIS_VIEWS)
+# ------------------------------------------
+# 1. VULNERABILITY BY MODEL
+# ------------------------------------------
+if section == "📊 Vulnerability by Model":
+    st.header("📊 Vulnerability by Model")
+    # Plot the mean of the boolean flag (the vulnerability rate) per model.
+    # Passing the raw rows to px.bar stacks one segment per row, so bar height
+    # becomes a count that grows with the number of runs.
+    rates = df_f.groupby("model", as_index=False)["vulnerable"].mean()
+    fig = px.bar(
+        rates, x="model", y="vulnerable", color="model",
+        labels={"vulnerable": "vulnerability rate"},
+    )
+    st.plotly_chart(fig, use_container_width=True)
+# ------------------------------------------
+# 2. VULNERABILITY BY TRIGGER
+# ------------------------------------------
+elif section == "🎯 Vulnerability by Trigger":
+    st.header("🎯 Vulnerability by Trigger")
+    # Same aggregation as the per-model view: rate, not stacked row count.
+    rates = df_f.groupby("trigger", as_index=False)["vulnerable"].mean()
+    fig = px.bar(
+        rates, x="trigger", y="vulnerable", color="trigger",
+        labels={"vulnerable": "vulnerability rate"},
+    )
+    st.plotly_chart(fig, use_container_width=True)
+# ------------------------------------------
+# 3. VULNERABILITY BY MODIFIER
+# ------------------------------------------
+elif section == "🧱 Vulnerability by Modifier":
+    st.header("🧱 Vulnerability by Modifier")
+    # Same aggregation as the per-model view: rate, not stacked row count.
+    rates = df_f.groupby("modifier", as_index=False)["vulnerable"].mean()
+    fig = px.bar(
+        rates, x="modifier", y="vulnerable", color="modifier",
+        labels={"vulnerable": "vulnerability rate"},
+    )
+    st.plotly_chart(fig, use_container_width=True)
+# ------------------------------------------
+# 4. MODEL × TRIGGER HEATMAP
+# ------------------------------------------
+elif section == "🔥 Model × Trigger Heatmap":
+    st.header("🔥 Model × Trigger Vulnerability Heatmap")
+    # One cell per (model, trigger): mean of the boolean `vulnerable` flag.
+    rate_grid = pd.pivot_table(
+        df_f, values="vulnerable", index="model", columns="trigger", aggfunc="mean"
+    )
+    heatmap = px.imshow(
+        rate_grid,
+        title="Model × Trigger Vulnerability Heatmap",
+        aspect="auto",
+        color_continuous_scale="Reds",
+    )
+    st.plotly_chart(heatmap, use_container_width=True)
+# ------------------------------------------
+# 5. MODEL × TRIGGER × MODIFIER TABLE
+# ------------------------------------------
+elif section == "🧩 Model × Trigger × Modifier Explorer":
+    st.header("🧩 Model × Trigger × Modifier Explorer")
+    # Rate and run count for every (model, trigger, modifier) combination,
+    # highest rate first.
+    by_combo = df_f.groupby(["model", "trigger", "modifier"])
+    combo_stats = by_combo.agg(
+        vuln_rate=("vulnerable", "mean"), runs=("vulnerable", "count")
+    )
+    st.dataframe(combo_stats.sort_values("vuln_rate", ascending=False))
+# ------------------------------------------
+# 6. TOP DANGEROUS SCENARIOS
+# ------------------------------------------
+elif section == "📦 Top Dangerous Scenarios":
+    st.header("📦 Top Dangerous Scenarios")
+    # Same statistics per scenario, limited to the 40 highest rates.
+    by_scenario = df_f.groupby("scenario")
+    scenario_stats = by_scenario.agg(
+        vuln_rate=("vulnerable", "mean"), runs=("vulnerable", "count")
+    )
+    ranked = scenario_stats.sort_values("vuln_rate", ascending=False)
+    st.dataframe(ranked.head(40))
+# ------------------------------------------
+# 7. SCORE DISTRIBUTION
+# ------------------------------------------
+elif section == "📈 Score Distribution":
+    st.header("📈 Score Distribution")
+    score_hist = px.histogram(df_f, x="score", nbins=5)
+    st.plotly_chart(score_hist, use_container_width=True)
+# ------------------------------------------
+# 8. VULN DISTRIBUTION BY MODEL
+# ------------------------------------------
+elif section == "📉 Vulnerability Distribution by Model":
+    st.header("📉 Vulnerability Distribution by Model")
+    # Box plot of the boolean `vulnerable` flag, one box per model.
+    vuln_box = px.box(df_f, x="model", y="vulnerable", color="model")
+    st.plotly_chart(vuln_box, use_container_width=True)
+# ------------------------------------------
+# 9. VIOLIN PLOTS
+# ------------------------------------------
+elif section == "🎻 Violin Plots (Per Model / Trigger)":
+    st.header("🎻 Distribution of Scores (Violin Plots)")
+    # Score distribution per model, with an embedded box plot.
+    score_violin = px.violin(df_f, x="model", y="score", color="model", box=True)
+    st.plotly_chart(score_violin, use_container_width=True)
+# ------------------------------------------
+# 10. TASK DIFFICULTY
+# ------------------------------------------
+elif section == "📚 Task Difficulty Explorer":
+    st.header("📚 Task Difficulty Explorer")
+    # Rows are tasks, columns are models, cells are the mean `vulnerable` flag.
+    task_grid = pd.pivot_table(
+        df_f, values="vulnerable", index="task_id", columns="model", aggfunc="mean"
+    )
+    task_heatmap = px.imshow(
+        task_grid,
+        title="Task Difficulty per Model",
+        aspect="auto",
+        color_continuous_scale="Reds",
+    )
+    st.plotly_chart(task_heatmap, use_container_width=True)
+# ------------------------------------------
+# 11. STATISTICAL TESTS
+# ------------------------------------------
+elif section == "📐 ANOVA & Statistical Tests":
+    st.header("📐 ANOVA & Statistical Tests")
+    st.subheader("Chi-Square Tests")
+    # One chi-square independence test per factor: does the vulnerable /
+    # not-vulnerable split depend on the model? On the trigger?
+    results = []
+    for factor in ("model", "trigger"):
+        contingency = pd.crosstab(df_f[factor], df_f["vulnerable"])
+        # chi2_contingency raises ValueError on a table with no observations
+        # (every row filtered out), so skip the test in that case.
+        if contingency.size == 0:
+            continue
+        chi2, p_value, _, _ = stats.chi2_contingency(contingency)
+        results.append(
+            {"test": f"{factor} vs vulnerability", "chi2": chi2, "p_value": p_value}
+        )
+    if results:
+        st.write(pd.DataFrame(results))
+    else:
+        st.info("Not enough data for a chi-square test with the current filters.")
+# ------------------------------------------
+# 12. SENSITIVITY INDEX
+# ------------------------------------------
+elif section == "⚡ Sensitivity Index (Model Stability)":
+    st.header("⚡ Sensitivity Index (per Model)")
+    # For each model, measure how much its vulnerability rate varies from one
+    # trigger to another (population std, range and mean of per-trigger rates).
+    rows = []
+    for m in df_f["model"].unique():
+        sub = df_f[df_f["model"] == m]
+        trig_rates = (
+            sub.groupby("trigger")["vulnerable"].mean().values
+        )
+        # A spread needs at least two triggers; models with fewer are skipped.
+        if len(trig_rates) > 1:
+            rows.append({
+                "model": m,
+                "std_trigger_rate": np.std(trig_rates),
+                "range_trigger_rate": trig_rates.max() - trig_rates.min(),
+                "mean_trigger_rate": trig_rates.mean(),
+            })
+    # With no qualifying model (e.g. a single trigger is selected) the frame
+    # would have no columns and sort_values would raise KeyError.
+    if rows:
+        st.dataframe(pd.DataFrame(rows).sort_values("std_trigger_rate", ascending=False))
+    else:
+        st.info(
+            "Need at least two triggers per model to compute a sensitivity index. "
+            "Widen the trigger filter."
+        )
+# ------------------------------------------
+# 13. CRITICAL SCENARIO EXPLORER
+# ------------------------------------------
+elif section == "🌀 Critical Scenario Explorer":
+    st.header("🌀 Critical (score=5) Scenario Explorer")
+    # Aggregate over ALL runs of each scenario so `runs` is the total number of
+    # runs. Counting only the critical rows makes `runs` equal `critical_count`.
+    per_scenario = df_f.groupby("scenario").agg(
+        critical_count=("critical", "sum"), runs=("critical", "count")
+    )
+    # Keep only scenarios that produced at least one critical result.
+    crit = per_scenario[per_scenario["critical_count"] > 0].sort_values(
+        "critical_count", ascending=False
+    )
+    st.dataframe(crit)