import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import sys
import os
import json
import datetime
from typing import Dict, List, Tuple, Union
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, RESULTS_PATH, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
def restart_space():
API.restart_space(repo_id=REPO_ID)
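# NOTE: the Space initialisation below is wrapped in `if False:` and is
# intentionally disabled; it is kept from the original HF leaderboard
# template for reference.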
if False:
### Space initialisation
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
#LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
#(
# finished_eval_queue_df,
# running_eval_queue_df,
# pending_eval_queue_df,
#) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def calculate_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
"""
Вычисляет лидерборд на основе данных в DataFrame.
Args:
df (pd.DataFrame): Исходный DataFrame с данными.
Returns:
pd.DataFrame: DataFrame с вычисленным лидербордом.
"""
if df.shape[0] == 0:
print(
"No data to calculate leaderboard. Returning original DataFrame."
)
return df
result = pd.DataFrame()
categories = df["vul_deepeval"].unique()
for category in categories:
tmp = df[df["vul_deepeval"] == category]
tmp_2 = (
tmp.groupby("agent_name")["score"]
.mean()
.reset_index(name=category)
.sort_values("agent_name")
)
if result.shape[0] == 0:
result = pd.concat([result, tmp_2], axis=1)
else:
result = pd.concat([result, tmp_2[category]], axis=1)
    # Reorder columns so that 'agent_name' comes first
result = result[["agent_name"] + [c for c in result.columns if c != "agent_name"]]
return result.round(2)
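# A minimal usage sketch for calculate_leaderboard, with hypothetical agent
# and category names (the real frames come from load_all_result_submits below):
#   df = pd.DataFrame({
#       "agent_name": ["gpt-4", "gpt-4", "llama-3"],
#       "vul_deepeval": ["bias", "toxicity", "bias"],
#       "score": [0.9, 0.8, 0.7],
#   })
#   calculate_leaderboard(df)  # one score column per category; an agent
#                              # missing a category gets NaN in that column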
def filter_dataframe(
df: pd.DataFrame,
show_manually_tested: bool,
query: str,
high_level_categories: List[str],
low_level_categories: List[str],
) -> pd.DataFrame:
"""
Фильтрует DataFrame на основе выбранных категорий и запроса.
Args:
df (pd.DataFrame): Исходный DataFrame.
show_manually_tested (bool): Флаг, указывающий, показывать ли данные,
прошедшие ручное тестирование.
query (str): Строка запроса для фильтрации по имени модели.
high_level_categories (List[str]): Список выбранных категорий высокого уровня.
low_level_categories (List[str]): Список выбранных категорий низкого уровня.
Returns:
pd.DataFrame: Отфильтрованный DataFrame.
"""
if not show_manually_tested:
filtered_df = df[~df["manually_tested"]]
else:
filtered_df = df
mask = (
filtered_df["type_general"].isin(high_level_categories)
& filtered_df["vul_deepeval"].isin(low_level_categories)
& filtered_df["agent_name"].str.contains(query, case=False, na=False)
)
return filtered_df[mask]
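# Example call with hypothetical category names: keep only automatically
# tested agents whose name contains "gpt":
#   filter_dataframe(df, show_manually_tested=False, query="gpt",
#                    high_level_categories=["harmful"],
#                    low_level_categories=["bias"])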
def update_table(
df: pd.DataFrame,
show_manually_tested: bool,
query: str,
high_level_categories: List[str],
low_level_categories: List[str],
) -> pd.DataFrame:
"""
Обновляет таблицу на основе фильтрованных данных.
Args:
df (pd.DataFrame): Исходный DataFrame.
show_manually_tested (bool): Флаг, указывающий, показывать ли
данные, прошедшие ручное тестирование.
query (str): Строка запроса для фильтрации по имени модели.
high_level_categories (List[str]): Список выбранных категорий высокого уровня.
low_level_categories (List[str]): Список выбранных категорий низкого уровня.
Returns:
pd.DataFrame: Отфильтрованный и обновленный DataFrame.
"""
filtered_df = filter_dataframe(
df,
show_manually_tested,
query,
high_level_categories,
low_level_categories,
)
result = calculate_leaderboard(filtered_df)
return result
def get_categories_mapping(
df: pd.DataFrame,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
"""
Возвращает словари соответствия между уровнями категорий.
Args:
df (pd.DataFrame): Исходный DataFrame.
Returns:
Tuple[Dict[str, List[str]], Dict[str, List[str]]]: Кортеж из двух словарей:
- high2low: Словарь, где ключи - категории высокого уровня,
значения - списки категорий низкого уровня.
- low2high: Словарь, где ключи - категории низкого уровня,
значения - списки категорий высокого уровня.
"""
high2low = df.groupby("type_general")["vul_deepeval"].apply(list).to_dict()
low2high = df.groupby("vul_deepeval")["type_general"].apply(list).to_dict()
return high2low, low2high
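# Note: groupby(...).apply(list) keeps one entry per matching row, so the
# mapped lists may contain duplicates; this is harmless for the membership
# and choice-building logic in update_categories below.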
def update_categories(
high_level_categories: List[str],
high2low: Dict[str, List[str]],
) -> dict:
    """
    Updates the low-level category choices when the high-level selection
    changes.

    Args:
        high_level_categories (List[str]): Selected high-level categories.
        high2low (Dict[str, List[str]]): Mapping from high-level to
            low-level categories.

    Returns:
        dict: A gr.update payload with the new choices and values for the
        low-level checkbox group.
    """
    low_levels = set()
    for hlc in high_level_categories:
        low_levels.update(high2low[hlc])
    low_levels = sorted(low_levels)
    return gr.update(choices=low_levels, value=low_levels)
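# Example with a hypothetical mapping: update_categories(["harmful"],
# {"harmful": ["bias", "toxicity"]}) returns a gr.update that re-populates
# the low-level checkbox group with both categories pre-selected.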
def init_leaderboard(dataframe):
if dataframe is None or dataframe.empty:
raise ValueError("Leaderboard DataFrame is empty or None.")
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
filter_columns=[
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
ColumnFilter(
AutoEvalColumn.params.name,
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
leaderboard_df_raw, high2low, low2high, leaderboard_table, leaderboard_table_raw = (
None,
None,
None,
None,
None,
)
def load_all_result_submits() -> List[Dict]:
path = RESULTS_PATH # RESULTS_REPO
# create an empty list to store the data
data = []
# iterate over all files in the directory
for filename in os.listdir(path):
# check if the file is a .json file
if filename.endswith('.json'):
# open the file and load the JSON data
with open(os.path.join(path, filename), 'r') as f:
json_data = json.load(f)
# assume the JSON data contains a list of dictionaries
# extract the list and add it to the data list
data.extend(json_data)
    # return the combined list of result records
return data
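# Each results file is assumed to hold a list of records shaped roughly like
# [{"agent_name": ..., "type_general": ..., "vul_deepeval": ...,
#   "score": ..., "manually_tested": ...}, ...] -- the columns the
# leaderboard functions above rely on.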
def initialize_leaderboard() -> Tuple[pd.DataFrame, Dict[str, List[str]], Dict[str, List[str]]]:
"""
Инициализирует данные лидерборда, получая данные от бэкенда и формируя DataFrame.
Args:
backend_client: Объект клиента бэкенда для получения данных.
Returns:
Tuple[pd.DataFrame, Dict[str, List[str]], Dict[str, List[str]]]: Кортеж, содержащий:
- leaderboard_df_raw: Отсортированный по убыванию баллов DataFrame с данными лидерборда.
- high2low: Словарь, где ключи - категории высокого уровня,
значения - списки категорий низкого уровня.
- low2high: Словарь, где ключи - категории низкого уровня,
значения - списки категорий высокого уровня.
"""
try:
        leaderboard_competitors = load_all_result_submits()  # previously: backend_client.fetch_leaderboard_competitors()
leaderboard_df_raw = pd.DataFrame(leaderboard_competitors).sort_values(
"score", ascending=False
)
cat_columns = leaderboard_df_raw["vul_deepeval"].unique()
d = {
c: c.replace("RTVulnerability.", "")
.replace("HARMFUL_", "")
.replace("_", " ")
.lower()
for c in cat_columns
}
leaderboard_df_raw["vul_deepeval"] = leaderboard_df_raw["vul_deepeval"].replace(
d
)
high2low, low2high = get_categories_mapping(leaderboard_df_raw)
return leaderboard_df_raw, high2low, low2high
except Exception as e:
print(f"Error initializing leaderboard: {e}")
#logger.error(f"Error initializing leaderboard: {e}")
return pd.DataFrame(), {}, {}
# Placeholder; replaced by the BackgroundScheduler created at the bottom of
# this file and used by save_json_results to schedule the results commit.
scheduler = None
def save_json_results(grFile):
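    """
    Saves an uploaded JSON results file locally, schedules a commit of the
    file to the Space repo, and reinitialises the leaderboard.

    Args:
        grFile (bytes): Raw bytes of the uploaded JSON file, expected to be
            a list of result records; the first record's "agent_name" is
            used to name the saved file.

    Returns:
        str: A styled status message for the UI.
    """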
global leaderboard_df_raw, high2low, low2high
path = RESULTS_PATH
#filename = grFile.name
json_str = grFile.decode("utf-8")
json_dict = json.loads(json_str)
model = json_dict[0]["agent_name"]
filename = model.replace("/","_")
print("user loading filename " + filename)
out_path = (f"{path}/{filename}.json")
out_path_abs = os.path.abspath(out_path)
if not os.path.exists(out_path_abs):
with open(out_path_abs, "w", encoding='utf-8') as fp:
json.dump(json_dict, fp, ensure_ascii=False)
print("saved results to json ",str(out_path_abs))
#print("Uploading eval file")
def inner_commit():
API.upload_file(
path_or_fileobj=out_path_abs,
path_in_repo=out_path,
#path_in_repo=out_path.split("eval-queue/")[1],
repo_id=REPO_ID,
repo_type="space",
commit_message=f"Add {model} to results",
)
run_date = datetime.datetime.now() + datetime.timedelta(seconds=10)
scheduler.add_job(inner_commit, 'date', run_date=run_date) # adjust the date and time accordingly
## Remove the local file
#os.remove(out_path_abs)
leaderboard_df_raw, high2low, low2high = initialize_leaderboard()
print("reinitialized leaderboard")
return styled_message(
f"Your request of {model} has been submitted!"
)
    else:
        return styled_warning(f"{model} already exists!")
leaderboard_df_raw, high2low, low2high = initialize_leaderboard()
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("Leaderboard"):
with gr.Blocks():
gr.Markdown("# LLM safety leaderboard")
with gr.Row():
high_level_categories = gr.CheckboxGroup(
choices=list(high2low.keys()),
value=list(high2low.keys()),
label="Select High Level Attack Category",
interactive=True,
)
with gr.Row():
low_level_categories = gr.CheckboxGroup(
choices=list(low2high.keys()),
value=list(low2high.keys()),
label="Select Low Level Attack Category",
interactive=True,
)
high_level_categories.change(
lambda hlc: update_categories(hlc, high2low),
inputs=[high_level_categories],
outputs=low_level_categories,
queue=True,
)
with gr.Row():
search_bar = gr.Textbox(
placeholder=" 🔍 Search for your model and press ENTER...",
show_label=False,
)
manually_tested_visibility = gr.Checkbox(
value=True,
label="Show manually tested agents",
interactive=True,
)
calculated_leaderboard = calculate_leaderboard(leaderboard_df_raw)
leaderboard_table = gr.DataFrame(
value=calculated_leaderboard,
headers=calculated_leaderboard.columns.to_list(),
interactive=False,
visible=True,
col_count=len(calculated_leaderboard.columns.to_list()),
)
leaderboard_table_raw = gr.DataFrame(
value=leaderboard_df_raw,
headers=leaderboard_df_raw.columns.to_list(),
visible=False,
col_count=len(leaderboard_df_raw.columns.to_list()),
)
search_bar.submit(
update_table,
[
leaderboard_table_raw,
manually_tested_visibility,
search_bar,
high_level_categories,
low_level_categories,
],
leaderboard_table,
)
for selector in [
manually_tested_visibility,
high_level_categories,
low_level_categories,
]:
selector.change(
update_table,
[
leaderboard_table_raw,
manually_tested_visibility,
search_bar,
high_level_categories,
low_level_categories,
],
leaderboard_table,
)
with gr.TabItem("Sumbit"):
with gr.Blocks():
with gr.Row():
with gr.Accordion("Automated testing"):
pass
with gr.Row():
with gr.Accordion("Manual testing"):
                    gr.Interface(
                        # previously: fn=backend_client.send_file_to_backend,
                        fn=save_json_results,
                        inputs=gr.File(
                            label="Upload file",
                            type="binary",
                        ),
                        outputs="text",
                        description="Upload a results file and view the outcome.",
                    )
if False:
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Row():
gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
submission_result,
)
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
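# Restart the Space every 30 minutes; the same scheduler instance is reused
# by save_json_results to commit uploaded result files shortly after upload.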
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
#demo.launch()