import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
REPORT_TEXT,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
AutoEvalColumn,
ModelType,
fields,
WeightType,
Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
def restart_space():
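    """Restart the Space so it re-initialises with fresh evaluation data."""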
API.restart_space(repo_id=REPO_ID)
### Space initialisation
try:
print(EVAL_REQUESTS_PATH)
snapshot_download(
repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
try:
print(EVAL_RESULTS_PATH)
snapshot_download(
repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
)
except Exception:
restart_space()
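# Build the leaderboard dataframe and the finished/running/pending evaluation-queue dataframes.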
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(dataframe):
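    """Wrap the results dataframe in a Leaderboard component with column selection, search, and filters."""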
if dataframe is None or dataframe.empty:
raise ValueError("Leaderboard DataFrame is empty or None.")
return Leaderboard(
value=dataframe,
datatype=[c.type for c in fields(AutoEvalColumn)],
select_columns=SelectColumns(
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
label="Select Columns to Display:",
),
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
filter_columns=[
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
ColumnFilter(
AutoEvalColumn.params.name,
type="slider",
min=0.01,
max=150,
label="Select the number of parameters (B)",
),
ColumnFilter(
AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
),
],
bool_checkboxgroup_label="Hide models",
interactive=False,
)
def draw_grace_radar():
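    """Plot a radar chart comparing two reference models across the five GRACE dimensions (fixed example scores)."""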
models = ["LLaMA-2-7b-chat", "Qwen-7B-Chat"]
labels = ["Instruction Following", "Coding", "Math", "Reasoning", "Multilingual"]
scores = [
[0.89, 0.87, 0.82, 0.92, 0.88],
[0.85, 0.84, 0.80, 0.90, 0.91],
]
    # One angle per axis, then repeat the first angle so the radar polygon closes.
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    for model, score in zip(models, scores):
        score = score + score[:1]  # close the polygon without mutating the scores list in place
        ax.plot(angles, score, label=model)
        ax.fill(angles, score, alpha=0.25)
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
ax.set_thetagrids(np.degrees(angles[:-1]), labels)
ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.title("GRACE Radar Evaluation")
return fig
# --- Load the two models locally ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
MODEL_A_PATH = "gpt2"
MODEL_B_PATH = "distilgpt2"
print("Loading Model A...")
tokenizer_a = AutoTokenizer.from_pretrained(MODEL_A_PATH)
model_a = AutoModelForCausalLM.from_pretrained(MODEL_A_PATH).to(device)
print("Model A loaded.")
print("Loading Model B...")
tokenizer_b = AutoTokenizer.from_pretrained(MODEL_B_PATH)
model_b = AutoModelForCausalLM.from_pretrained(MODEL_B_PATH).to(device)
print("Model B loaded.")
def _infer(model, tokenizer, input_text: str) -> str:
    """Generate a short continuation with the given model and return only the newly generated text."""
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=True,
            temperature=0.8,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Drop the prompt tokens so only the generated continuation is returned.
    input_len = inputs["input_ids"].shape[1]
    generated_only = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)
    return generated_only.strip()

def model_a_infer(input_text: str) -> str:
    return _infer(model_a, tokenizer_a, input_text)

def model_b_infer(input_text: str) -> str:
    return _infer(model_b, tokenizer_b, input_text)
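# --- Build the Gradio UI ---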
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
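            # Static radar chart comparing the two reference models on the GRACE dimensions.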
radar_plot = gr.Plot(value=draw_grace_radar(), label="GRACE Radar Evaluation")
gr.Markdown("本图展示了两个模型在 GRACE 五大任务维度下的性能对比。")
with gr.TabItem("🧪 Arena", elem_id="arena-tab-table", id=4):
gr.Markdown("## 🔁 Arena: 模型同台竞技")
arena_input = gr.Textbox(label="输入文本 (适用于所有模型)", lines=3)
arena_output_a = gr.Textbox(label="模型 A 输出 (GPT2)", lines=6)
arena_output_b = gr.Textbox(label="模型 B 输出 (DistilGPT2)", lines=6)
arena_button = gr.Button("运行 Arena 对比")
def run_arena(text):
if not text.strip():
return "请输入内容", "请输入内容"
return model_a_infer(text), model_b_infer(text)
arena_button.click(run_arena, inputs=arena_input, outputs=[arena_output_a, arena_output_b])
with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
gr.Markdown(REPORT_TEXT, elem_classes="markdown-text")
with gr.TabItem("🚀 Submit here!", elem_id="submit-tab-table", id=3):
with gr.Column():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Accordion(
f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
gr.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
gr.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
gr.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
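            # Model submission form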
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(label="Model name")
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weights type",
multiselect=False,
value="Original",
interactive=True,
)
base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
submit_button.click(
add_new_eval,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
submission_result,
)
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
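# Restart the Space every 30 minutes so it picks up newly published results.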
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()