work / app.py
hellokawei's picture
Update app.py
4f244aa verified
raw
history blame
20.9 kB
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
import os
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch # 导入 torch
# 从现有的 src 导入,这些我们无法修改,但需要继续使用其提供的功能
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT, # 这个可能不再需要,但保留以防万一
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
# =====================================================================
# **重要修改开始:直接在 app.py 中定义 GRACE 相关的类和函数**
# =====================================================================
from enum import Enum
from typing import NamedTuple, List
class Column(NamedTuple):
    """Static description of one leaderboard column."""

    name: str  # display header; also used as the DataFrame column name
    type: str  # gradio_leaderboard datatype tag ("str", "number", "boolean")
    displayed_by_default: bool = True  # shown without opening the column selector
    never_hidden: bool = False  # user cannot deselect this column
    hidden: bool = False  # excluded from the table entirely
    filterable: bool = True  # eligible for a ColumnFilter widget
class AutoEvalColumn(Enum):
    """Registry of all leaderboard columns; each member's value is a Column.

    Note: member ``.name`` is the Python identifier (e.g. "model"); the display
    header lives on the payload as ``.value.name`` (e.g. "Model").
    """

    model = Column("Model", "str", displayed_by_default=True, never_hidden=True)
    model_type = Column("Model type", "str", displayed_by_default=True)
    precision = Column("Precision", "str", displayed_by_default=False)
    params = Column("Params (B)", "number", displayed_by_default=True)
    license = Column("License", "str", displayed_by_default=False)
    still_on_hub = Column("On Hub", "boolean", displayed_by_default=True, hidden=True)
    # Columns added for the GRACE framework; scores are stored on a 0.0-1.0
    # scale (sliders emit 0-100 and update_leaderboard divides by 100).
    generalization_score = Column("G: 泛化性", "number", displayed_by_default=True, filterable=True)
    relevance_score = Column("R: 相关性", "number", displayed_by_default=True, filterable=True)
    artistry_score = Column("A: 创新表现力", "number", displayed_by_default=True, filterable=True)
    consistency_score = Column("C: 一致性", "number", displayed_by_default=True, filterable=True)
    efficiency_score = Column("E: 效率性", "number", displayed_by_default=True, filterable=True)
def fields(cls: type) -> List[Column]:
    """Collect the Column payload of every enum member that carries one."""
    collected = []
    for member in cls:
        payload = member.value
        if isinstance(payload, Column):
            collected.append(payload)
    return collected
class ModelType(Enum):
    """Closed set of model categories shown on the leaderboard."""

    LanguageModeling = "语言生成模型"
    ImageGeneration = "图像生成模型"
    Unknown = "未知"

    def to_str(self, sep: str = " : ") -> str:
        """Render as '<member name><sep><display label>'."""
        return sep.join((self.name, self.value))
class WeightType(Enum):
    """Weight-format tags; each value is a one-off NamedTuple instance whose
    .name field mirrors the member name.

    NOTE(review): presumably read via ``member.value.name`` like Precision —
    no call site is visible in this file; confirm before refactoring.
    """

    Original = NamedTuple("Original", [("name", str)])("Original")
    Lora = NamedTuple("Lora", [("name", str)])("Lora")
class Precision(Enum):
    """Numeric-precision tags; values are NamedTuple instances so callers can
    read the label via ``Precision.float16.value.name`` (as the mock
    leaderboard rows do)."""

    float16 = NamedTuple("float16", [("name", str)])("float16")
    bfloat16 = NamedTuple("bfloat16", [("name", str)])("bfloat16")
    Unknown = NamedTuple("Unknown", [("name", str)])("Unknown")
# Column payloads for every AutoEvalColumn member.
COLS = fields(AutoEvalColumn)
# Columns that carry the benchmark content itself (identity + GRACE scores).
# NOTE(review): these are Column namedtuples, not display-name strings —
# confirm downstream consumers expect the payload objects rather than names.
BENCHMARK_COLS = [
    AutoEvalColumn.model.value,
    AutoEvalColumn.params.value,
    AutoEvalColumn.generalization_score.value,
    AutoEvalColumn.relevance_score.value,
    AutoEvalColumn.artistry_score.value,
    AutoEvalColumn.consistency_score.value,
    AutoEvalColumn.efficiency_score.value,
]
# Display headers / datatype tags for the evaluation-queue tables.
EVAL_COLS = [c.name for c in fields(AutoEvalColumn)]
EVAL_TYPES = [c.type for c in fields(AutoEvalColumn)]
# get_leaderboard_df / get_evaluation_queue_df are simplified: this demo scores
# models manually rather than running automatic evaluations, so these functions
# only supply placeholder display data.
def get_leaderboard_df(eval_results_path: str, eval_requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Build the mock leaderboard DataFrame.

    The path/column arguments are accepted for interface compatibility but
    ignored — rows are generated in memory with all GRACE scores zeroed,
    waiting for the user's manual assessment.
    """
    print("使用模拟数据填充排行榜。")
    grace_columns = ["G: 泛化性", "R: 相关性", "A: 创新表现力", "C: 一致性", "E: 效率性"]
    # (friendly name, parameter count in billions, license id)
    base_specs = [
        ("Gemma 2B Instruct", 2.0, "apache-2.0"),
        ("Phi-2", 2.7, "mit"),
        ("GPT-Neo 125M", 0.125, "apache-2.0"),
    ]
    rows = []
    for friendly_name, params_b, license_id in base_specs:
        row = {
            "Model": friendly_name,
            "Model type": ModelType.LanguageModeling.to_str(),
            "Precision": Precision.float16.value.name,
            "Params (B)": params_b,
            "License": license_id,
            "On Hub": True,
        }
        # All GRACE dimensions start at 0.0 until the user scores them.
        row.update({col: 0.0 for col in grace_columns})
        rows.append(row)
    # No sorting needed here: every score starts at zero.
    return pd.DataFrame(rows)
def get_evaluation_queue_df(eval_requests_path: str, eval_cols: list):
    """Return three empty queue frames (finished, running, pending).

    The evaluation queue is disabled in this demo, so the same empty
    DataFrame (carrying only the expected column headers) stands in for
    all three queues.
    """
    placeholder = pd.DataFrame(columns=eval_cols)
    return placeholder, placeholder, placeholder
# =====================================================================
# **重要修改结束**
# =====================================================================
# The full project exposes API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH,
# QUEUE_REPO, REPO_ID, RESULTS_REPO and TOKEN via src.envs; here the token is
# read directly from the environment so the demo runs standalone.
# Set HF_TOKEN in the Hugging Face Space Secrets.
TOKEN = os.getenv("HF_TOKEN")  # must be configured in the Space Secrets
# These paths are assumed writable, but in this demo nothing is persisted to
# them — evaluation results live only in memory.
EVAL_REQUESTS_PATH = "./eval_requests"
EVAL_RESULTS_PATH = "./eval_results"
# The demo never needs real Hub API calls (no Space restarts, no job
# submission), so a stand-in object with the same method surface is used.
class MockAPI:
    """Drop-in stand-in for the Hub API client; logs instead of acting."""

    def restart_space(self, repo_id: str):
        # Log the would-be restart; intentionally performs no side effect.
        print(f"MockAPI: Restarting space {repo_id}. (No actual restart for demo)")
class MockSubmit:
    """Replaces the real submission flow; every call gets the same notice."""

    def add_new_eval(self, *args):
        # Submission is disabled in this demo; ignore all arguments and
        # answer with a fixed user-facing message.
        return "在此演示中,模型已预先加载,无需提交新评估。"
# Wire the mock implementations in place of the real Hub API / submit flow.
API = MockAPI()
add_new_eval = MockSubmit().add_new_eval
REPO_ID = os.getenv("HF_SPACE_ID", "your-org/your-space-name")  # Space ID from env, with a placeholder default
# Models (and their tokenizers) to pre-load and compare side by side.
# Small checkpoints are chosen deliberately to fit free-tier Space resources.
MODELS_TO_COMPARE = [
    {"id": "google/gemma-2b-it", "name": "Gemma 2B Instruct"},
    {"id": "microsoft/phi-2", "name": "Phi-2"},
    {"id": "EleutherAI/gpt-neo-125m", "name": "GPT-Neo 125M"},  # smallest model, most likely to load
]
# Cache of loaded models/tokenizers keyed by model id; None marks a failed load.
loaded_models = {}
def load_models():
    """Populate the module-level ``loaded_models`` cache.

    For every entry in MODELS_TO_COMPARE the tokenizer and causal-LM weights
    are fetched from the Hub; a failed load is recorded as ``None`` so that
    inference code can skip that model instead of crashing.
    """
    global loaded_models
    # CUDA availability cannot change mid-loop, so pick the device once.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    for spec in MODELS_TO_COMPARE:
        repo, friendly = spec["id"], spec["name"]
        print(f"正在加载模型: {friendly} ({repo})...")
        try:
            print(f"模型 {repo} 将加载到 {target_device}")
            tok = AutoTokenizer.from_pretrained(repo, token=TOKEN)
            if target_device == "cuda":
                # Half precision keeps GPU memory low; prefer bf16 when supported.
                dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
                net = AutoModelForCausalLM.from_pretrained(
                    repo,
                    torch_dtype=dtype,
                    token=TOKEN,
                ).to(target_device)
            else:
                net = AutoModelForCausalLM.from_pretrained(repo, token=TOKEN)
            loaded_models[repo] = {"model": net, "tokenizer": tok, "name": friendly}
            print(f"成功加载模型: {friendly}")
        except Exception as err:
            print(f"加载模型 {friendly} ({repo}) 失败: {err}")
            # A None entry marks this model as unavailable for later inference.
            loaded_models[repo] = None
# Load all models at import time, before the Gradio Blocks below are built,
# so the interface is only constructed once the weights are available.
load_models()
# Text-generation fan-out across all comparison models.
def generate_text(prompt, max_new_tokens=100):
    """Run the prompt through every comparison model.

    Returns one output string per model, ordered exactly like
    MODELS_TO_COMPARE so each string maps onto its Gradio textbox; failures
    are reported inline as user-facing messages instead of raising.
    """
    results_by_name = {}
    for spec in MODELS_TO_COMPARE:
        friendly = spec["name"]
        entry = loaded_models.get(spec["id"])
        if not entry:
            # load_models() recorded a failure for this checkpoint.
            results_by_name[friendly] = "模型未加载或加载失败。"
            continue
        model = entry["model"]
        tokenizer = entry["tokenizer"]
        try:
            encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
            # Sampling parameters chosen for controllable, non-repetitive output.
            token_ids = model.generate(
                **encoded,
                max_new_tokens=max_new_tokens,
                do_sample=True,       # enable sampling
                temperature=0.7,      # randomness of the output
                top_k=50,             # sample among the 50 most likely tokens
                top_p=0.95,           # nucleus sampling cutoff
                pad_token_id=tokenizer.eos_token_id,  # reuse EOS as pad token
                eos_token_id=tokenizer.eos_token_id,
            )
            results_by_name[friendly] = tokenizer.decode(token_ids[0], skip_special_tokens=True)
        except Exception as err:
            results_by_name[friendly] = f"生成失败: {err}"
    # Emit in the fixed comparison order so each string lands in its textbox.
    return [results_by_name.get(spec["name"], "模型未加载或加载失败。") for spec in MODELS_TO_COMPARE]
# Write manually-assessed GRACE scores into the leaderboard.
def update_leaderboard(g_score, r_score, a_score, c_score, e_score, model_idx):
    """Store the five GRACE slider scores for one model and re-rank the board.

    Args:
        g_score..e_score: slider values on a 0-100 scale; stored as 0.0-1.0.
        model_idx: index into MODELS_TO_COMPARE. The dropdown is built from
            (label, index) tuples but its initial ``value`` is set to a model
            NAME, so this may arrive as either an int index or a name string.

    Returns:
        The (possibly re-sorted) global LEADERBOARD_DF, for redisplay.
    """
    global LEADERBOARD_DF
    # Bug fix: with the dropdown untouched, model_idx arrives as the model
    # name (a str) and the `0 <= model_idx` comparison below used to raise
    # TypeError. Normalize a name to its index; unknown names become None.
    if isinstance(model_idx, str):
        names = [m["name"] for m in MODELS_TO_COMPARE]
        model_idx = names.index(model_idx) if model_idx in names else None
    if model_idx is not None and 0 <= model_idx < len(MODELS_TO_COMPARE):
        model_name_to_update = MODELS_TO_COMPARE[model_idx]["name"]
        # Locate the row for this model in the leaderboard frame.
        row_index = LEADERBOARD_DF[LEADERBOARD_DF['Model'] == model_name_to_update].index
        if not row_index.empty:
            # Sliders emit 0-100; the leaderboard stores normalized 0.0-1.0.
            LEADERBOARD_DF.loc[row_index, 'G: 泛化性'] = g_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'R: 相关性'] = r_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'A: 创新表现力'] = a_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'C: 一致性'] = c_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'E: 效率性'] = e_score / 100.0
            # Keep the board ranked by generalization score, best first.
            LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by="G: 泛化性", ascending=False).reset_index(drop=True)
            return LEADERBOARD_DF
    # Unknown/invalid selection: return the frame unchanged.
    return LEADERBOARD_DF
# Populate the leaderboard with the simulated rows defined above.
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
# The evaluation queue is disabled in this demo; all three frames are empty.
(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
def init_leaderboard(dataframe):
    """Build the gradio_leaderboard Leaderboard component.

    Falls back to an empty, filter-less board when ``dataframe`` is None or
    empty. All column identifiers passed to the component are display headers
    taken from the Column payloads (``.value.name``) so that they match the
    DataFrame's actual column names.
    """
    columns = fields(AutoEvalColumn)
    datatypes = [c.type for c in columns]
    column_selector = SelectColumns(
        default_selection=[c.name for c in columns if c.displayed_by_default],
        cant_deselect=[c.name for c in columns if c.never_hidden],
        label="选择要显示的列:",
    )
    # Bug fix: Enum-member .name ("model", "license", ...) is the Python
    # identifier, not the display header ("Model", "License") used as the
    # DataFrame column name. Use the Column payload's .value.name throughout,
    # matching how the GRACE filters below already did it.
    search_cols = [AutoEvalColumn.model.value.name, AutoEvalColumn.license.value.name]
    hidden_cols = [c.name for c in columns if c.hidden]
    if dataframe is None or dataframe.empty:
        print("Leaderboard DataFrame 为空或 None,初始化空排行榜。")
        return Leaderboard(
            value=pd.DataFrame(columns=[c.name for c in columns]),
            datatype=datatypes,
            select_columns=column_selector,
            search_columns=search_cols,
            hide_columns=hidden_cols,
            filter_columns=[],  # no data, so no filters
            bool_checkboxgroup_label="隐藏模型",
            interactive=False,  # read-only board
        )
    # One 0.0-1.0 slider filter per GRACE dimension.
    grace_filters = [
        ColumnFilter(col.value.name, type="slider", min=0.0, max=1.0, label=label, step=0.01)
        for col, label in (
            (AutoEvalColumn.generalization_score, "G: 泛化性得分"),
            (AutoEvalColumn.relevance_score, "R: 相关性得分"),
            (AutoEvalColumn.artistry_score, "A: 创新表现力得分"),
            (AutoEvalColumn.consistency_score, "C: 一致性得分"),
            (AutoEvalColumn.efficiency_score, "E: 效率性得分"),
        )
    ]
    return Leaderboard(
        value=dataframe,
        datatype=datatypes,
        select_columns=column_selector,
        search_columns=search_cols,
        hide_columns=hidden_cols,
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.value.name, type="checkboxgroup", label="模型类型"),
            ColumnFilter(AutoEvalColumn.precision.value.name, type="checkboxgroup", label="精度"),
            ColumnFilter(
                AutoEvalColumn.params.value.name,
                type="slider",
                min=0.01,
                max=150,
                label="选择参数数量 (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.value.name, type="boolean", label="已删除/不完整", default=True
            ),
        ] + grace_filters,
        bool_checkboxgroup_label="隐藏模型",
        interactive=False,  # read-only board
    )
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: side-by-side generation plus manual GRACE scoring.
        with gr.TabItem("💬 模型比较与生成", elem_id="model-comparison-tab", id=0):
            gr.Markdown("## 输入您的提示,查看不同模型的生成效果!", elem_classes="markdown-text")
            with gr.Row():
                input_prompt = gr.Textbox(label="输入提示词", placeholder="请写一首关于春天的诗歌。", lines=3)
                generate_button = gr.Button("生成文本")
            # One read-only output box per model, in MODELS_TO_COMPARE order so
            # generate_text's ordered return list maps onto them positionally.
            output_boxes = []
            for model_info in MODELS_TO_COMPARE:
                output_boxes.append(gr.Textbox(label=f"{model_info['name']} 的生成结果", lines=5, interactive=False))
            # Run all models on the prompt and fan the results out to the boxes.
            generate_button.click(
                fn=generate_text,
                inputs=[input_prompt],
                outputs=output_boxes
            )
            gr.Markdown("## 手动评估 GRACE 维度", elem_classes="markdown-text")
            gr.Markdown("请手动评估上述生成结果,并更新排行榜中的 GRACE 分数。", elem_classes="markdown-text")
            # Dropdown of (label, index) pairs for picking the model to score.
            # NOTE(review): `value` is set to the model NAME (the label), not
            # the index, so the initial selection may reach the handler as a
            # string — confirm update_leaderboard tolerates that.
            model_selector = gr.Dropdown(
                choices=[(m["name"], idx) for idx, m in enumerate(MODELS_TO_COMPARE)],
                label="选择要评估的模型",
                interactive=True,
                value=MODELS_TO_COMPARE[0]["name"] if MODELS_TO_COMPARE else None  # default to the first model
            )
            # GRACE dimension sliders (0-100; stored normalized to 0.0-1.0).
            with gr.Column():
                generalization_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="G: 泛化性得分 (0-100)")
                relevance_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="R: 相关性得分 (0-100)")
                artistry_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="A: 创新表现力得分 (0-100)")
                consistency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="C: 一致性得分 (0-100)")
                efficiency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="E: 效率性得分 (0-100)")
            update_grace_button = gr.Button("更新 GRACE 评分到排行榜")
            # The Leaderboard component must exist before the click handler
            # below can reference it as an output.
            # NOTE(review): Gradio renders a component where it is instantiated,
            # so this board displays in THIS tab; the "LLM Benchmark" tab below
            # only aliases the variable and renders nothing — verify intent.
            leaderboard = init_leaderboard(LEADERBOARD_DF)
            # Push the slider scores into the DataFrame and refresh the board.
            update_grace_button.click(
                fn=update_leaderboard,
                inputs=[
                    generalization_slider,
                    relevance_slider,
                    artistry_slider,
                    consistency_slider,
                    efficiency_slider,
                    model_selector  # selected model index (or label; see NOTE above)
                ],
                outputs=leaderboard  # component to refresh with the returned frame
            )
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            # Alias of the component created in tab 0 (see NOTE(review) above).
            leaderboard_display = leaderboard
        with gr.TabItem("📝 关于", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        # The submission tab is retained for structural parity but disabled.
        with gr.TabItem("🚀 在此提交!", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown("## 在此演示中,模型已预先加载进行比较,无需提交新模型。", elem_classes="markdown-text")
            gr.Markdown("您可以在 **💬 模型比较与生成** 标签页中输入提示词并评估模型。", elem_classes="markdown-text")
            gr.Markdown("(本页面仅用于保留原始结构,实际提交功能已禁用)")
    with gr.Row():
        with gr.Accordion("📙 引用", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )
# Background job that restarts the Space every 30 minutes (1800 s). With the
# mock API this only logs; kept so re-enabling the real API needs no rewiring.
scheduler = BackgroundScheduler()
scheduler.add_job(API.restart_space, "interval", seconds=1800, args=[REPO_ID])
scheduler.start()
demo.queue(default_concurrency_limit=1).launch()  # low concurrency to avoid running out of memory