import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import os
import json
import datetime
from typing import Dict, List, Tuple, Union

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.display.formatting import styled_error, styled_message, styled_warning
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, RESULTS_PATH, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)


# Snapshot syncing against the Hub is disabled in this version (the original
# code guarded it with a bare `if False:`); results are instead read from the
# local RESULTS_PATH by load_all_result_submits() below.
DOWNLOAD_SNAPSHOTS = False

if DOWNLOAD_SNAPSHOTS:
    try:
        print(EVAL_REQUESTS_PATH)
        snapshot_download(
            repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        restart_space()
    try:
        print(EVAL_RESULTS_PATH)
        snapshot_download(
            repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
        )
    except Exception:
        restart_space()

def calculate_leaderboard(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes the leaderboard from the raw results in the DataFrame.

    Args:
        df (pd.DataFrame): Raw results DataFrame.

    Returns:
        pd.DataFrame: DataFrame with the computed leaderboard
            (one row per agent, one column per category).
    """
    if df.shape[0] == 0:
        print("No data to calculate leaderboard. Returning original DataFrame.")
        return df

    result = pd.DataFrame()
    categories = df["vul_deepeval"].unique()

    for category in categories:
        tmp = df[df["vul_deepeval"] == category]
        # Mean score per agent for this category, sorted by agent name so that
        # rows line up across categories when concatenated column-wise below.
        tmp_2 = (
            tmp.groupby("agent_name")["score"]
            .mean()
            .reset_index(name=category)
            .sort_values("agent_name")
        )

        if result.shape[0] == 0:
            result = pd.concat([result, tmp_2], axis=1)
        else:
            result = pd.concat([result, tmp_2[category]], axis=1)

    # Keep "agent_name" as the first column.
    result = result[["agent_name"] + [c for c in result.columns if c != "agent_name"]]

    return result.round(2)
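

# A sketch of the shape transform performed above, on hypothetical values
# (the category and agent names are illustrative, not from real data):
#
#   raw rows                                ->  leaderboard
#   agent_name  vul_deepeval  score             agent_name  bias  hate speech
#   agent-a     bias          0.9               agent-a     0.90  0.70
#   agent-a     hate speech   0.7               agent-b     0.50  0.80
#   agent-b     bias          0.5
#   agent-b     hate speech   0.8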


def filter_dataframe(
    df: pd.DataFrame,
    show_manually_tested: bool,
    query: str,
    high_level_categories: List[str],
    low_level_categories: List[str],
) -> pd.DataFrame:
    """
    Filters the DataFrame by the selected categories and search query.

    Args:
        df (pd.DataFrame): Raw results DataFrame.
        show_manually_tested (bool): Whether to include agents that were
            manually tested.
        query (str): Search string matched against the agent name.
        high_level_categories (List[str]): Selected high-level categories.
        low_level_categories (List[str]): Selected low-level categories.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    if not show_manually_tested:
        filtered_df = df[~df["manually_tested"]]
    else:
        filtered_df = df

    mask = (
        filtered_df["type_general"].isin(high_level_categories)
        & filtered_df["vul_deepeval"].isin(low_level_categories)
        # Plain substring match; regex=False avoids errors when the query
        # contains regex metacharacters.
        & filtered_df["agent_name"].str.contains(query, case=False, na=False, regex=False)
    )

    return filtered_df[mask]
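

# Hypothetical usage sketch (category names are illustrative): hide manually
# tested agents and keep only agents whose name contains "gpt" within the
# selected categories:
#
#     visible = filter_dataframe(
#         leaderboard_df_raw,
#         show_manually_tested=False,
#         query="gpt",
#         high_level_categories=["harmful content"],
#         low_level_categories=["hate speech", "bias"],
#     )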


def update_table(
    df: pd.DataFrame,
    show_manually_tested: bool,
    query: str,
    high_level_categories: List[str],
    low_level_categories: List[str],
) -> pd.DataFrame:
    """
    Updates the leaderboard table from the filtered data.

    Args:
        df (pd.DataFrame): Raw results DataFrame.
        show_manually_tested (bool): Whether to include agents that were
            manually tested.
        query (str): Search string matched against the agent name.
        high_level_categories (List[str]): Selected high-level categories.
        low_level_categories (List[str]): Selected low-level categories.

    Returns:
        pd.DataFrame: Filtered and recomputed leaderboard DataFrame.
    """
    filtered_df = filter_dataframe(
        df,
        show_manually_tested,
        query,
        high_level_categories,
        low_level_categories,
    )

    result = calculate_leaderboard(filtered_df)

    return result


def get_categories_mapping(
    df: pd.DataFrame,
) -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """
    Returns the mappings between category levels.

    Args:
        df (pd.DataFrame): Raw results DataFrame.

    Returns:
        Tuple[Dict[str, List[str]], Dict[str, List[str]]]: A tuple of two dicts:
            - high2low: keys are high-level categories,
              values are lists of low-level categories.
            - low2high: keys are low-level categories,
              values are lists of high-level categories.
    """
    # Deduplicate: the raw table holds one row per (agent, probe), so a plain
    # list would repeat the same category once per row.
    high2low = df.groupby("type_general")["vul_deepeval"].apply(lambda s: sorted(s.unique())).to_dict()
    low2high = df.groupby("vul_deepeval")["type_general"].apply(lambda s: sorted(s.unique())).to_dict()
    return high2low, low2high
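

# Shape of the returned mappings (hypothetical category names):
#
#     high2low = {"harmful content": ["bias", "hate speech"], ...}
#     low2high = {"bias": ["harmful content"], "hate speech": ["harmful content"], ...}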


def update_categories(
    high_level_categories: List[str],
    high2low: Dict[str, List[str]],
) -> gr.update:
    """
    Updates the low-level category choices based on the selected
    high-level categories.

    Args:
        high_level_categories (List[str]): Selected high-level categories.
        high2low (Dict[str, List[str]]): Mapping from high-level to
            low-level categories.

    Returns:
        gr.update: Updated choices and values for the low-level
            category widget.
    """
    low_levels = set()
    for hlc in high_level_categories:
        low_levels.update(high2low[hlc])

    # Sort for a deterministic checkbox order.
    low_levels = sorted(low_levels)
    return gr.update(choices=low_levels, value=low_levels)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


leaderboard_df_raw, high2low, low2high = None, None, None
leaderboard_table, leaderboard_table_raw = None, None


def load_all_result_submits() -> List[Dict]:
    """Loads every submitted result file in RESULTS_PATH into one flat list."""
    path = RESULTS_PATH

    data = []

    for filename in os.listdir(path):
        if filename.endswith(".json"):
            with open(os.path.join(path, filename), "r", encoding="utf-8") as f:
                json_data = json.load(f)

            # Each file holds a list of per-probe records; merge them all.
            data.extend(json_data)

    return data
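

# Assumed shape of one record inside a results file (hypothetical values,
# inferred from the columns the functions above rely on, not from a spec):
#
#     {
#         "agent_name": "my-org/my-agent",      # agent/model identifier
#         "score": 0.87,                        # per-probe score
#         "vul_deepeval": "RTVulnerability.HARMFUL_HATE_SPEECH",
#         "type_general": "harmful content",    # high-level attack category
#         "manually_tested": false              # true if scored by a human
#     }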


def initialize_leaderboard() -> Tuple[pd.DataFrame, Dict[str, List[str]], Dict[str, List[str]]]:
    """
    Initializes the leaderboard data by loading all submitted results
    and building the raw DataFrame.

    Returns:
        Tuple[pd.DataFrame, Dict[str, List[str]], Dict[str, List[str]]]: A tuple of:
            - leaderboard_df_raw: Raw leaderboard DataFrame, sorted by score
              in descending order.
            - high2low: keys are high-level categories,
              values are lists of low-level categories.
            - low2high: keys are low-level categories,
              values are lists of high-level categories.
    """
    try:
        leaderboard_competitors = load_all_result_submits()
        leaderboard_df_raw = pd.DataFrame(leaderboard_competitors).sort_values(
            "score", ascending=False
        )
        cat_columns = leaderboard_df_raw["vul_deepeval"].unique()

        # Normalize raw enum-style category names into readable labels.
        d = {
            c: c.replace("RTVulnerability.", "")
            .replace("HARMFUL_", "")
            .replace("_", " ")
            .lower()
            for c in cat_columns
        }
        leaderboard_df_raw["vul_deepeval"] = leaderboard_df_raw["vul_deepeval"].replace(d)

        high2low, low2high = get_categories_mapping(leaderboard_df_raw)
        return leaderboard_df_raw, high2low, low2high
    except Exception as e:
        print(f"Error initializing leaderboard: {e}")
        # Fall back to an empty leaderboard rather than crashing the app.
        return pd.DataFrame(), {}, {}
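

# The name normalization above, worked on a hypothetical raw value:
#     "RTVulnerability.HARMFUL_HATE_SPEECH"
#       -> strip "RTVulnerability."  -> "HARMFUL_HATE_SPEECH"
#       -> strip "HARMFUL_"          -> "HATE_SPEECH"
#       -> "_" to " ", lowercase     -> "hate speech"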


scheduler = None  # replaced by a real BackgroundScheduler at the bottom of this file


def save_json_results(gr_file):
    global leaderboard_df_raw, high2low, low2high
    path = RESULTS_PATH

    json_str = gr_file.decode("utf-8")
    json_dict = json.loads(json_str)
    model = json_dict[0]["agent_name"]
    filename = model.replace("/", "_")
    print(f"user loading filename {filename}")
    out_path = f"{path}/{filename}.json"
    out_path_abs = os.path.abspath(out_path)
    if not os.path.exists(out_path_abs):
        with open(out_path_abs, "w", encoding="utf-8") as fp:
            json.dump(json_dict, fp, ensure_ascii=False)
        print("saved results to json", str(out_path_abs))

        def inner_commit():
            API.upload_file(
                path_or_fileobj=out_path_abs,
                path_in_repo=out_path,
                repo_id=REPO_ID,
                repo_type="space",
                commit_message=f"Add {model} to results",
            )

        # Schedule the Hub commit ~10 seconds out rather than uploading inline.
        run_date = datetime.datetime.now() + datetime.timedelta(seconds=10)
        scheduler.add_job(inner_commit, "date", run_date=run_date)

        leaderboard_df_raw, high2low, low2high = initialize_leaderboard()
        print("reinitialized leaderboard")
        return styled_message(f"Your request of {model} has been submitted!")
    else:
        return styled_message(f"{model} already exists!")

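# save_json_results is wired below to a gr.File(type="binary") input, so it
# receives the uploaded file as raw bytes; a hypothetical direct call:
#
#     with open("my_results.json", "rb") as f:
#         message_html = save_json_results(f.read())
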
leaderboard_df_raw, high2low, low2high = initialize_leaderboard()

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Leaderboard"):
            with gr.Blocks():
                gr.Markdown("# LLM safety leaderboard")
                with gr.Row():
                    high_level_categories = gr.CheckboxGroup(
                        choices=list(high2low.keys()),
                        value=list(high2low.keys()),
                        label="Select High Level Attack Category",
                        interactive=True,
                    )
                with gr.Row():
                    low_level_categories = gr.CheckboxGroup(
                        choices=list(low2high.keys()),
                        value=list(low2high.keys()),
                        label="Select Low Level Attack Category",
                        interactive=True,
                    )

                high_level_categories.change(
                    lambda hlc: update_categories(hlc, high2low),
                    inputs=[high_level_categories],
                    outputs=low_level_categories,
                    queue=True,
                )

                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder=" 🔍 Search for your model and press ENTER...",
                        show_label=False,
                    )
                    manually_tested_visibility = gr.Checkbox(
                        value=True,
                        label="Show manually tested agents",
                        interactive=True,
                    )
                calculated_leaderboard = calculate_leaderboard(leaderboard_df_raw)
                leaderboard_table = gr.DataFrame(
                    value=calculated_leaderboard,
                    headers=calculated_leaderboard.columns.to_list(),
                    interactive=False,
                    visible=True,
                    col_count=len(calculated_leaderboard.columns.to_list()),
                )
                # Hidden copy of the raw per-probe table; used as the input
                # state for the filtering callbacks below.
                leaderboard_table_raw = gr.DataFrame(
                    value=leaderboard_df_raw,
                    headers=leaderboard_df_raw.columns.to_list(),
                    visible=False,
                    col_count=len(leaderboard_df_raw.columns.to_list()),
                )

                search_bar.submit(
                    update_table,
                    [
                        leaderboard_table_raw,
                        manually_tested_visibility,
                        search_bar,
                        high_level_categories,
                        low_level_categories,
                    ],
                    leaderboard_table,
                )

                # Every filter widget re-runs the same table update on change.
                for selector in [
                    manually_tested_visibility,
                    high_level_categories,
                    low_level_categories,
                ]:
                    selector.change(
                        update_table,
                        [
                            leaderboard_table_raw,
                            manually_tested_visibility,
                            search_bar,
                            high_level_categories,
                            low_level_categories,
                        ],
                        leaderboard_table,
                    )

        with gr.TabItem("Submit"):
            with gr.Blocks():
                with gr.Row():
                    with gr.Accordion("Automated testing"):
                        pass

                with gr.Row():
                    with gr.Accordion("Manual testing"):
                        gr.Interface(
                            fn=save_json_results,
                            inputs=gr.File(
                                label="Upload a file",
                                type="binary",
                            ),
                            outputs="text",
                            description="Upload a results file and view the result.",
                        )

        # Disabled tabs inherited from the upstream HF leaderboard template
        # (originally guarded with a bare `if False:`). Before enabling, the
        # referenced dataframes (LEADERBOARD_DF and the three eval-queue
        # dataframes) must be built first, e.g. via get_leaderboard_df and
        # get_evaluation_queue_df imported above.
        SHOW_TEMPLATE_TABS = False
        if SHOW_TEMPLATE_TABS:
            with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
                leaderboard = init_leaderboard(LEADERBOARD_DF)

            with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

            with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
                with gr.Column():
                    with gr.Row():
                        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                    with gr.Column():
                        with gr.Accordion(
                            f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                            open=False,
                        ):
                            with gr.Row():
                                finished_eval_table = gr.components.Dataframe(
                                    value=finished_eval_queue_df,
                                    headers=EVAL_COLS,
                                    datatype=EVAL_TYPES,
                                    row_count=5,
                                )
                        with gr.Accordion(
                            f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                            open=False,
                        ):
                            with gr.Row():
                                running_eval_table = gr.components.Dataframe(
                                    value=running_eval_queue_df,
                                    headers=EVAL_COLS,
                                    datatype=EVAL_TYPES,
                                    row_count=5,
                                )

                        with gr.Accordion(
                            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                            open=False,
                        ):
                            with gr.Row():
                                pending_eval_table = gr.components.Dataframe(
                                    value=pending_eval_queue_df,
                                    headers=EVAL_COLS,
                                    datatype=EVAL_TYPES,
                                    row_count=5,
                                )
                with gr.Row():
                    gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

                with gr.Row():
                    with gr.Column():
                        model_name_textbox = gr.Textbox(label="Model name")
                        revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                        model_type = gr.Dropdown(
                            choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                            label="Model type",
                            multiselect=False,
                            value=None,
                            interactive=True,
                        )

                    with gr.Column():
                        precision = gr.Dropdown(
                            choices=[i.value.name for i in Precision if i != Precision.Unknown],
                            label="Precision",
                            multiselect=False,
                            value="float16",
                            interactive=True,
                        )
                        weight_type = gr.Dropdown(
                            choices=[i.value.name for i in WeightType],
                            label="Weights type",
                            multiselect=False,
                            value="Original",
                            interactive=True,
                        )
                        base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

                submit_button = gr.Button("Submit Eval")
                submission_result = gr.Markdown()
                submit_button.click(
                    add_new_eval,
                    [
                        model_name_textbox,
                        base_model_name_textbox,
                        revision_name_textbox,
                        precision,
                        weight_type,
                        model_type,
                    ],
                    submission_result,
                )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )


scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()