Initial
Browse files- .gitignore +2 -0
- .streamlit/config.toml +5 -0
- Benchmark.csv +24 -0
- app.py +87 -0
- requirements.txt +37 -0
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
venv
|
| 2 |
+
Dockerfile
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[theme]
|
| 2 |
+
primaryColor="#01d2fc"
|
| 3 |
+
backgroundColor="#252040"
|
| 4 |
+
secondaryBackgroundColor="#262626"
|
| 5 |
+
textColor="#f4f4f4"
|
Benchmark.csv
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model name, source, v1 metric, v2 metric
|
| 2 |
+
OpenAI: GPT-4.5 (Preview),Proprietary Model,100%,97%
|
| 3 |
+
OpenAI: o3 Mini High,Proprietary Model,100%,96%
|
| 4 |
+
OpenAI: o3 Mini,Proprietary Model,100%,96%
|
| 5 |
+
OpenAI: GPT-4o,Proprietary Model,99.09%,95%
|
| 6 |
+
OpenAI: GPT-4o-mini,Proprietary Model,99.09%,97%
|
| 7 |
+
Anthropic: Claude 3.5 Sonnet,Proprietary Model,99.09%,97%
|
| 8 |
+
Anthropic: Claude 3.5 Haiku,Proprietary Model,100%,97%
|
| 9 |
+
Anthropic: Claude 3.7 Sonnet,Proprietary Model,99.09%,98%
|
| 10 |
+
Google: Gemma 3 27B,Open Source,98.18%,95%
|
| 11 |
+
Google: Gemini Flash 2.0,Proprietary Model,100%,99%
|
| 12 |
+
Google: Gemini 2.0 Flash Lite,Proprietary Model,100%,97%
|
| 13 |
+
DeepSeek: R1,Open Source,100%,98%
|
| 14 |
+
DeepSeek: DeepSeek V3,Open Source,100%,97%
|
| 15 |
+
Mistral: Mistral Small 3.1 24B,Open Source,100%,97%
|
| 16 |
+
Mistral: Mistral Small 3,Open Source,99.09%,97%
|
| 17 |
+
Mistral: Mistral Large 2411,Open Source,99.09%,96%
|
| 18 |
+
Meta: Llama 3.3 70B Instruct,Open Source,100%,97%
|
| 19 |
+
Meta: Llama 3.2 3B Instruct,Open Source,78.18%,75%
|
| 20 |
+
Qwen: QwQ 32B,Open Source,100.00%,96%
|
| 21 |
+
Microsoft: Phi 4,Proprietary Model,100%,97%
|
| 22 |
+
Microsoft: Phi-3.5 Mini 128K Instruct,Open Source,99.09%,97%
|
| 23 |
+
Microsoft: Phi-3 Mini 128K Instruct,Open Source,98.18%,98%
|
| 24 |
+
|
app.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
import pandas as pd

# Set page configuration.
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="Cyber Benchmark Hub: SECQA Leaderboard", layout="wide")

# Main Title (ensures it's displayed on the main page)
st.title("Cyber Benchmark Hub: SECQA Leaderboard")
st.markdown("## Powered by **Priam Cyber AI**")
# Link out to the SECQA benchmark dataset the leaderboard below is built from.
st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
| 12 |
+
# Loader for the benchmark CSV; cached so the file is parsed once per session.
@st.cache_data
def load_data(file_path):
    """Read the benchmark CSV at *file_path* and return a cleaned DataFrame.

    Cleaning steps:
      * drop auto-generated ``Unnamed`` columns produced by trailing commas,
      * strip whitespace from headers and rename them to display names,
      * convert the percentage columns ("100%") to float fractions (1.0);
        unparseable cells become NaN via ``errors='coerce'``.
    """
    frame = pd.read_csv(file_path)

    # Trailing commas in the CSV yield "Unnamed: N" columns — keep only real ones.
    real_columns = ~frame.columns.str.contains('Unnamed', na=False)
    frame = frame.loc[:, real_columns]

    # Normalize header whitespace, then map raw headers to display names.
    frame.columns = frame.columns.str.strip()
    frame = frame.rename(columns={
        "model name": "Model",
        "source": "Type",
        "v1 metric": "V1 Accuracy",
        "v2 metric": "V2 Accuracy",
    })

    # "100%" -> 1.0; non-numeric leftovers become NaN and are dropped later.
    for metric in ("V1 Accuracy", "V2 Accuracy"):
        raw = frame[metric].astype(str).str.replace("%", "").str.strip()
        frame[metric] = pd.to_numeric(raw, errors='coerce') / 100

    return frame
| 35 |
+
|
| 36 |
+
# Load dataset
file_path = "Benchmark.csv"  # Ensure this file is uploaded in your Hugging Face Space
df = load_data(file_path)

# Sidebar: Logo, then Filters and Options
with st.sidebar:
    st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
    st.divider()
    st.header("Filters & Options")
    dataset_version = st.radio("Select Dataset Version", ["v1", "v2"])
    source_filter = st.multiselect(
        "Select Model Type",
        options=df["Type"].unique().tolist(),
        default=df["Type"].unique().tolist()
    )
    st.markdown("---")
    st.header("Test Parameters")
    # Fixed generation parameters used for every benchmarked model.
    test_params = pd.DataFrame({
        "Value": [0, 1, 0, 1, 0]
    }, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
    st.table(test_params)

# Apply filtering based on the sidebar selections.
# .copy() avoids pandas chained-assignment issues below and ensures the frame
# returned by the cached loader is never mutated in place.
df_filtered = (df[df["Type"].isin(source_filter)] if source_filter else df).copy()

# Choose the correct metric version and compute Accuracy
metric_column = "V1 Accuracy" if dataset_version == "v1" else "V2 Accuracy"
df_filtered["Accuracy"] = df_filtered[metric_column]
df_filtered = df_filtered[["Model", "Type", "Accuracy"]].dropna()  # Drop rows with errors

# Sort by Accuracy descending and add a Rank column starting from 1
df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(drop=True)
df_filtered.insert(0, "Rank", range(1, len(df_filtered) + 1))

# Use columns to display leaderboard and model details side-by-side
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader(f"Leaderboard for SECQA Version {dataset_version}")
    st.dataframe(df_filtered)

with col2:
    st.subheader("Model Details")
    if df_filtered.empty:
        # Guard: with no matching rows, selectbox/iloc[0] below would fail.
        st.warning("No models match the current filters.")
    else:
        selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
        model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
        st.write(f"**Model:** {model_details['Model']}")
        st.write(f"**Type:** {model_details['Type']}")
        st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
        st.write(f"**Rank:** {model_details['Rank']}")

# Footer
st.markdown("---")
st.info("More dataset benchmarks will be added to this hub in the future.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair==5.5.0
|
| 2 |
+
attrs==25.3.0
|
| 3 |
+
blinker==1.9.0
|
| 4 |
+
cachetools==5.5.2
|
| 5 |
+
certifi==2025.1.31
|
| 6 |
+
charset-normalizer==3.4.1
|
| 7 |
+
click==8.1.8
|
| 8 |
+
gitdb==4.0.12
|
| 9 |
+
GitPython==3.1.44
|
| 10 |
+
idna==3.10
|
| 11 |
+
Jinja2==3.1.6
|
| 12 |
+
jsonschema==4.23.0
|
| 13 |
+
jsonschema-specifications==2024.10.1
|
| 14 |
+
MarkupSafe==3.0.2
|
| 15 |
+
narwhals==1.31.0
|
| 16 |
+
numpy==2.2.4
|
| 17 |
+
packaging==24.2
|
| 18 |
+
pandas==2.2.3
|
| 19 |
+
pillow==11.1.0
|
| 20 |
+
protobuf==5.29.4
|
| 21 |
+
pyarrow==19.0.1
|
| 22 |
+
pydeck==0.9.1
|
| 23 |
+
python-dateutil==2.9.0.post0
|
| 24 |
+
pytz==2025.1
|
| 25 |
+
referencing==0.36.2
|
| 26 |
+
requests==2.32.3
|
| 27 |
+
rpds-py==0.23.1
|
| 28 |
+
six==1.17.0
|
| 29 |
+
smmap==5.0.2
|
| 30 |
+
streamlit==1.43.2
|
| 31 |
+
tenacity==9.0.0
|
| 32 |
+
toml==0.10.2
|
| 33 |
+
tornado==6.4.2
|
| 34 |
+
typing_extensions==4.12.2
|
| 35 |
+
tzdata==2025.2
|
| 36 |
+
urllib3==2.3.0
|
| 37 |
+
watchdog==6.0.0
|