About
Browse files
app.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
st.set_page_config(page_title="Cyber Benchmark Hub:
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
st.title("Cyber Benchmark Hub: SECQA Leaderboard")
|
| 7 |
-
st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
|
| 8 |
|
| 9 |
with st.sidebar:
|
| 10 |
st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
|
|
@@ -34,6 +34,37 @@ with st.sidebar:
|
|
| 34 |
}, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
|
| 35 |
st.table(test_params)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
# Determine file path based on dataset choice.
|
| 38 |
# For now, if dataset_choice is "secQA", we use "Benchmark.csv"
|
| 39 |
if dataset_choice == "secQA":
|
|
@@ -89,22 +120,69 @@ df_filtered = df_filtered.sort_values("Accuracy", ascending=False).reset_index(d
|
|
| 89 |
df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
|
| 90 |
df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
st.
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
st.
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
st.set_page_config(page_title="Cyber Benchmark Hub: Leaderboard", layout="wide")
|
| 5 |
+
|
| 6 |
+
st.title("Cyber Benchmark Hub: Leaderboard")
|
| 7 |
|
|
|
|
|
|
|
| 8 |
|
| 9 |
with st.sidebar:
|
| 10 |
st.image("https://cdn.prod.website-files.com/630f558f2a15ca1e88a2f774/631f1436ad7a0605fecc5e15_Logo.svg", use_container_width=True)
|
|
|
|
| 34 |
}, index=["Temperature", "n", "Presence Penalty", "Top_p", "Frequency Penalty"])
|
| 35 |
st.table(test_params)
|
| 36 |
|
| 37 |
+
# Function to estimate random baseline accuracy for MCQ datasets
|
| 38 |
+
def estimate_random_accuracy(questions):
    """
    Estimate the expected accuracy of answering multiple-choice
    questions by guessing uniformly at random.

    Args:
        questions: List of (question_id, num_choices) tuples, one per
            question.

    Returns:
        The mean probability of a correct random guess across all
        questions, or 0.0 when the list is empty.
    """
    if not questions:
        return 0.0
    # Each question contributes 1/num_choices; the baseline accuracy
    # is the average of those per-question probabilities.
    return sum(1.0 / num_choices for _, num_choices in questions) / len(questions)
|
| 58 |
+
|
| 59 |
+
# NOTE(review): the lists below give (question_id, num_choices) per question,
# and the choice counts vary from 1 to 4 — not a uniform 4 as previously stated.
# The dataset card's figure of 242 questions does not match the 110 + 100
# entries enumerated here; confirm against the SECQA dataset card.
|
| 61 |
+
total_questions = 242
|
| 62 |
+
questionnaire = [(1, 4), (2, 1), (3, 4), (4, 2), (5, 3), (6, 3), (7, 4), (8, 2), (9, 4), (10, 2), (11, 4), (12, 4), (13, 2), (14, 2), (15, 4), (16, 4), (17, 2), (18, 2), (19, 2), (20, 1), (21, 2), (22, 4), (23, 1), (24, 4), (25, 3), (26, 3), (27, 2), (28, 3), (29, 2), (30, 1), (31, 2), (32, 3), (33, 3), (34, 2), (35, 4), (36, 3), (37, 1), (38, 2), (39, 1), (40, 2), (41, 1), (42, 3), (43, 3), (44, 1), (45, 3), (46, 1), (47, 4), (48, 2), (49, 2), (50, 4), (51, 2), (52, 4), (53, 1), (54, 4), (55, 3), (56, 3), (57, 3), (58, 1), (59, 2), (60, 4), (61, 1), (62, 3), (63, 1), (64, 3), (65, 1), (66, 3), (67, 4), (68, 1), (69, 1), (70, 1), (71, 3), (72, 2), (73, 1), (74, 2), (75, 3), (76, 3), (77, 3), (78, 4), (79, 1), (80, 4), (81, 4), (82, 4), (83, 2), (84, 3), (85, 2), (86, 1), (87, 1), (88, 2), (89, 2), (90, 2), (91, 4), (92, 4), (93, 3), (94, 2), (95, 3), (96, 3), (97, 2), (98, 4), (99, 4), (100, 3), (101, 4), (102, 2), (103, 4), (104, 2), (105, 3), (106, 2), (107, 3), (108, 4), (109, 4), (110, 2)]
|
| 63 |
+
questionnairev2 = [(1, 4), (2, 4), (3, 2), (4, 3), (5, 2), (6, 4), (7, 3), (8, 2), (9, 3), (10, 2), (11, 1), (12, 2), (13, 3), (14, 2), (15, 4), (16, 2), (17, 2), (18, 4), (19, 4), (20, 3), (21, 4), (22, 3), (23, 3), (24, 3), (25, 1), (26, 1), (27, 2), (28, 2), (29, 2), (30, 2), (31, 2), (32, 4), (33, 3), (34, 3), (35, 3), (36, 3), (37, 4), (38, 3), (39, 3), (40, 4), (41, 1), (42, 2), (43, 3), (44, 2), (45, 1), (46, 1), (47, 2), (48, 4), (49, 2), (50, 1), (51, 3), (52, 1), (53, 4), (54, 4), (55, 2), (56, 3), (57, 2), (58, 2), (59, 1), (60, 3), (61, 3), (62, 1), (63, 2), (64, 2), (65, 3), (66, 4), (67, 3), (68, 3), (69, 1), (70, 1), (71, 3), (72, 1), (73, 2), (74, 4), (75, 4), (76, 1), (77, 4), (78, 4), (79, 3), (80, 1), (81, 2), (82, 2), (83, 3), (84, 2), (85, 1), (86, 2), (87, 4), (88, 2), (89, 2), (90, 4), (91, 3), (92, 2), (93, 1), (94, 2), (95, 3), (96, 1), (97, 1), (98, 4), (99, 1), (100, 1)]
|
| 64 |
+
random_accuracy = estimate_random_accuracy(questionnaire)
|
| 65 |
+
random_accuracyv2 = estimate_random_accuracy(questionnairev2)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
# Determine file path based on dataset choice.
|
| 69 |
# For now, if dataset_choice is "secQA", we use "Benchmark.csv"
|
| 70 |
if dataset_choice == "secQA":
|
|
|
|
| 120 |
df_filtered['Rank'] = df_filtered['Accuracy'].rank(method='dense', ascending=False).astype(int)
|
| 121 |
df_filtered = df_filtered[['Rank', 'Model', 'Type', 'Accuracy']]
|
| 122 |
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
tab1, tab2 = st.tabs(["Leaderboard", "About"])
|
| 126 |
+
|
| 127 |
+
with tab1:
|
| 128 |
+
st.markdown("#### [View the SECQA Dataset](https://huggingface.co/datasets/zefang-liu/secqa)")
|
| 129 |
+
|
| 130 |
+
# Use columns to display leaderboard and model details side-by-side
|
| 131 |
+
col1, col2 = st.columns([2, 1])
|
| 132 |
+
|
| 133 |
+
with col1:
|
| 134 |
+
st.subheader(f"Leaderboard for {dataset_choice.upper()} Version {dataset_version}")
|
| 135 |
+
st.dataframe(df_filtered.style.hide(axis='index'))
|
| 136 |
+
|
| 137 |
+
with col2:
|
| 138 |
+
st.subheader("Model Details")
|
| 139 |
+
selected_model = st.selectbox("Select a Model", df_filtered["Model"].tolist())
|
| 140 |
+
model_details = df_filtered[df_filtered["Model"] == selected_model].iloc[0]
|
| 141 |
+
st.write(f"**Model:** {model_details['Model']}")
|
| 142 |
+
st.write(f"**Type:** {model_details['Type']}")
|
| 143 |
+
st.write(f"**Accuracy:** {model_details['Accuracy']:.2%}")
|
| 144 |
+
st.write(f"**Rank:** {model_details['Rank']}")
|
| 145 |
+
|
| 146 |
+
st.divider()
|
| 147 |
+
# Display the random baseline accuracy above the leaderboard
|
| 148 |
+
st.markdown("### Random Baseline Accuracy")
|
| 149 |
+
st.markdown("**{:.2%}** (computed with random guessing on SECQAv1)".format(random_accuracy))
|
| 150 |
+
st.markdown("**{:.2%}** (computed with random guessing on SECQAv2)".format(random_accuracyv2))
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
# Footer
|
| 154 |
+
st.markdown("---")
|
| 155 |
+
st.info("More dataset benchmarks will be added to this hub in the future.")
|
| 156 |
+
|
| 157 |
+
with tab2:
|
| 158 |
+
st.title("About the Cyber Benchmark Hub")
|
| 159 |
+
st.markdown("""
|
| 160 |
+
Welcome to the **Cyber Benchmark Hub: Leaderboard**!
|
| 161 |
+
|
| 162 |
+
This application benchmarks language models on their performance across cybersecurity question-answering tasks using the [SECQA dataset](https://huggingface.co/datasets/zefang-liu/secqa). It provides an interactive interface to explore model accuracy, rank models, and understand how different model types perform on security-centric multiple-choice questions.
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
### Leaderboard Features
|
| 166 |
+
|
| 167 |
+
- Compare **different models** (e.g., GPT, Claude, Mistral) based on SECQA v1 or v2.
|
| 168 |
+
- Filter by **model type/source** (open-source, closed)
|
| 169 |
+
- View **dense rankings** where models with equal accuracy share the same rank.
|
| 170 |
+
- See detailed information for each model, including:
|
| 171 |
+
- Accuracy score
|
| 172 |
+
- Rank
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
### Random Baseline Accuracy
|
| 176 |
+
|
| 177 |
+
The app computes the **expected accuracy** if a model guessed randomly on all questions:
|
| 178 |
+
|
| 179 |
+
This helps contextualize the actual performance of models.
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
### Built by
|
| 184 |
+
|
| 185 |
+
[Priam.ai](https://www.priam.ai/)
|
| 186 |
+
|
| 187 |
+
*This benchmark hub will continue to expand as more models and datasets are released in the cybersecurity NLP space.*
|
| 188 |
+
""") # Replace with actual random_accuracy values if available
|