import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
import os
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Imports from the existing src package. These modules cannot be modified here,
# but we keep using the functionality they provide.
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,  # possibly no longer needed, kept just in case
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css

# =====================================================================
# Important change begins: GRACE-related classes and functions are
# defined directly in app.py.
# =====================================================================
from enum import Enum
from typing import NamedTuple, List
class Column(NamedTuple):
    name: str
    type: str
    displayed_by_default: bool = True
    never_hidden: bool = False
    hidden: bool = False
    filterable: bool = True


class AutoEvalColumn(Enum):
    model = Column("Model", "str", displayed_by_default=True, never_hidden=True)
    model_type = Column("Model type", "str", displayed_by_default=True)
    precision = Column("Precision", "str", displayed_by_default=False)
    params = Column("Params (B)", "number", displayed_by_default=True)
    license = Column("License", "str", displayed_by_default=False)
    still_on_hub = Column("On Hub", "boolean", displayed_by_default=True, hidden=True)
    # New columns for the GRACE framework
    generalization_score = Column("G: Generalization", "number", displayed_by_default=True, filterable=True)
    relevance_score = Column("R: Relevance", "number", displayed_by_default=True, filterable=True)
    artistry_score = Column("A: Artistry", "number", displayed_by_default=True, filterable=True)
    consistency_score = Column("C: Consistency", "number", displayed_by_default=True, filterable=True)
    efficiency_score = Column("E: Efficiency", "number", displayed_by_default=True, filterable=True)


def fields(cls: type) -> List[Column]:
    return [c.value for c in cls if isinstance(c.value, Column)]
class ModelType(Enum):
    LanguageModeling = "Language generation model"
    ImageGeneration = "Image generation model"
    Unknown = "Unknown"

    def to_str(self, sep: str = " : ") -> str:
        return f"{self.name}{sep}{self.value}"


class WeightType(Enum):
    Original = NamedTuple("Original", [("name", str)])("Original")
    Lora = NamedTuple("Lora", [("name", str)])("Lora")


class Precision(Enum):
    float16 = NamedTuple("float16", [("name", str)])("float16")
    bfloat16 = NamedTuple("bfloat16", [("name", str)])("bfloat16")
    Unknown = NamedTuple("Unknown", [("name", str)])("Unknown")
COLS = fields(AutoEvalColumn)
BENCHMARK_COLS = [
    AutoEvalColumn.model.value,
    AutoEvalColumn.params.value,
    AutoEvalColumn.generalization_score.value,
    AutoEvalColumn.relevance_score.value,
    AutoEvalColumn.artistry_score.value,
    AutoEvalColumn.consistency_score.value,
    AutoEvalColumn.efficiency_score.value,
]
EVAL_COLS = [c.name for c in fields(AutoEvalColumn)]
EVAL_TYPES = [c.type for c in fields(AutoEvalColumn)]
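
# Illustration (sketch, derived from the definitions above): fields(AutoEvalColumn)
# yields the Column namedtuples in declaration order, so the lists above resolve to the
# display names and datatypes shared by the mock DataFrame and the Leaderboard component:
#   EVAL_COLS[:4]  -> ["Model", "Model type", "Precision", "Params (B)"]
#   EVAL_TYPES[:4] -> ["str", "str", "str", "number"]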
# Simplified get_leaderboard_df and get_evaluation_queue_df.
# Since models are compared manually rather than evaluated automatically,
# these functions mainly serve to display mock data.
def get_leaderboard_df(eval_results_path: str, eval_requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    print("Populating the leaderboard with mock data.")
    # No longer read from files here; generate mock data directly.
    all_results = [
        {
            "Model": "Gemma 2B Instruct",  # friendly display name
            "Model type": ModelType.LanguageModeling.to_str(),
            "Precision": Precision.float16.value.name,
            "Params (B)": 2.0,
            "License": "apache-2.0",
            "On Hub": True,
            "G: Generalization": 0.0,  # starts at 0, waiting for user input
            "R: Relevance": 0.0,
            "A: Artistry": 0.0,
            "C: Consistency": 0.0,
            "E: Efficiency": 0.0,
        },
        {
            "Model": "Phi-2",  # friendly display name
            "Model type": ModelType.LanguageModeling.to_str(),
            "Precision": Precision.float16.value.name,
            "Params (B)": 2.7,
            "License": "mit",
            "On Hub": True,
            "G: Generalization": 0.0,
            "R: Relevance": 0.0,
            "A: Artistry": 0.0,
            "C: Consistency": 0.0,
            "E: Efficiency": 0.0,
        },
        {
            "Model": "GPT-Neo 125M",  # friendly display name
            "Model type": ModelType.LanguageModeling.to_str(),
            "Precision": Precision.float16.value.name,
            "Params (B)": 0.125,
            "License": "apache-2.0",
            "On Hub": True,
            "G: Generalization": 0.0,
            "R: Relevance": 0.0,
            "A: Artistry": 0.0,
            "C: Consistency": 0.0,
            "E: Efficiency": 0.0,
        },
    ]
    df = pd.DataFrame(all_results)
    # Post-process the DataFrame if needed, e.g. sorting (not needed here since all scores are 0).
    return df


def get_evaluation_queue_df(eval_requests_path: str, eval_cols: list):
    # The evaluation queue is no longer a core feature; return empty DataFrames.
    empty_df = pd.DataFrame(columns=eval_cols)
    return empty_df, empty_df, empty_df
# =====================================================================
# Important change ends
# =====================================================================

# API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO and TOKEN
# are assumed to be available in src.envs.
# If TOKEN is not defined in src.envs, set HF_TOKEN in the Hugging Face Space secrets.
# To keep this app runnable, TOKEN is read directly via os.getenv here.
TOKEN = os.getenv("HF_TOKEN")  # make sure HF_TOKEN is set in the Space secrets

# These paths are assumed to be writable, but in this scenario they are no longer
# used to store evaluation results.
EVAL_REQUESTS_PATH = "./eval_requests"
EVAL_RESULTS_PATH = "./eval_results"

# For this demo there is no need for real API calls to restart the Space or submit jobs,
# so a mock API class is used instead.
class MockAPI:
    def restart_space(self, repo_id: str):
        print(f"MockAPI: Restarting space {repo_id}. (No actual restart for demo)")


class MockSubmit:
    def add_new_eval(self, *args):
        # This function no longer performs a real submission; it just returns a message.
        return "In this demo the models are preloaded, so no new evaluation needs to be submitted."


API = MockAPI()
add_new_eval = MockSubmit().add_new_eval

REPO_ID = os.getenv("HF_SPACE_ID", "your-org/your-space-name")  # Space ID from the environment, with a fallback default
# Preload the models and tokenizers.
# Given the resource limits of a free Space, smaller models are chosen here.
MODELS_TO_COMPARE = [
    {"id": "google/gemma-2b-it", "name": "Gemma 2B Instruct"},
    {"id": "microsoft/phi-2", "name": "Phi-2"},
    {"id": "EleutherAI/gpt-neo-125m", "name": "GPT-Neo 125M"},  # an even smaller model that is sure to load
]

# Storage for the loaded models and tokenizers
loaded_models = {}


def load_models():
    global loaded_models
    for model_info in MODELS_TO_COMPARE:
        model_id = model_info["id"]
        model_name = model_info["name"]
        print(f"Loading model: {model_name} ({model_id})...")
        try:
            # Load the model onto the GPU (cuda) if available, otherwise the CPU
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"Model {model_id} will be loaded on {device}")
            tokenizer = AutoTokenizer.from_pretrained(model_id, token=TOKEN)
            # Use torch.float16 or torch.bfloat16 to reduce memory usage
            if device == "cuda":
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
                    token=TOKEN,
                ).to(device)
            else:  # CPU
                model = AutoModelForCausalLM.from_pretrained(model_id, token=TOKEN)
            loaded_models[model_id] = {"model": model, "tokenizer": tokenizer, "name": model_name}
            print(f"Successfully loaded model: {model_name}")
        except Exception as e:
            print(f"Failed to load model {model_name} ({model_id}): {e}")
            # If loading fails, mark the model as unavailable so inference can skip it.
            loaded_models[model_id] = None  # indicates a failed load


# Load the models at application startup.
# Note: this is called before the Gradio Blocks launch() so the models are ready
# before the interface is initialized.
load_models()
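
# Alternative loading sketch (assumption: the `accelerate` package is installed in the
# Space). For larger checkpoints, transformers can place/offload weights automatically
# with device_map="auto" instead of an explicit .to(device):
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id, device_map="auto", torch_dtype=torch.float16, token=TOKEN
#   )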
# Text generation function
def generate_text(prompt, max_new_tokens=100):
    outputs = {}
    for model_info in MODELS_TO_COMPARE:  # iterate MODELS_TO_COMPARE so the order matches the output boxes
        model_id = model_info["id"]
        model_name = model_info["name"]
        model_data = loaded_models.get(model_id)  # fetch from loaded_models
        if model_data:  # only generate if the model loaded successfully
            model = model_data["model"]
            tokenizer = model_data["tokenizer"]
            try:
                inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                # print(f"Generating with {model_name} on device: {model.device}")
                # Tune the generation parameters for better controllability
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,  # enable sampling
                    temperature=0.7,  # randomness of the generated text
                    top_k=50,  # sample from the k most likely tokens
                    top_p=0.95,  # nucleus sampling with cumulative probability p
                    pad_token_id=tokenizer.eos_token_id,  # handle the pad token
                    eos_token_id=tokenizer.eos_token_id,  # end-of-sequence token
                )
                generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
                outputs[model_name] = generated_text
            except Exception as e:
                outputs[model_name] = f"Generation failed: {e}"
        else:
            outputs[model_name] = "Model not loaded or failed to load."
    # Return the results in the order of MODELS_TO_COMPARE
    ordered_outputs = [outputs.get(m["name"], "Model not loaded or failed to load.") for m in MODELS_TO_COMPARE]
    return ordered_outputs  # a list with one entry per output box
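
# Usage sketch (commented out, not executed at import time): calling generate_text
# directly returns one string per entry in MODELS_TO_COMPARE, in the same order as the
# Gradio output boxes:
#   results = generate_text("Write a short poem about spring.", max_new_tokens=50)
#   for info, text in zip(MODELS_TO_COMPARE, results):
#       print(info["name"], "->", text[:80])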
# Update the leaderboard data
def update_leaderboard(g_score, r_score, a_score, c_score, e_score, model_idx):
    global LEADERBOARD_DF
    # The model index is assumed to follow the order of MODELS_TO_COMPARE.
    # A real application may need a more robust way of matching models.
    if model_idx is not None and 0 <= model_idx < len(MODELS_TO_COMPARE):
        model_name_to_update = MODELS_TO_COMPARE[model_idx]["name"]
        # Locate the corresponding row in the DataFrame
        row_index = LEADERBOARD_DF[LEADERBOARD_DF['Model'] == model_name_to_update].index
        if not row_index.empty:
            # Update the GRACE scores. The leaderboard stores scores in the 0.0-1.0 range,
            # while the Gradio sliders output 0-100, so divide by 100 to convert.
            LEADERBOARD_DF.loc[row_index, 'G: Generalization'] = g_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'R: Relevance'] = r_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'A: Artistry'] = a_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'C: Consistency'] = c_score / 100.0
            LEADERBOARD_DF.loc[row_index, 'E: Efficiency'] = e_score / 100.0
            # Re-sort the leaderboard (here by the Generalization score)
            LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by="G: Generalization", ascending=False).reset_index(drop=True)
            return LEADERBOARD_DF
    return LEADERBOARD_DF  # return the (possibly unchanged) DataFrame
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
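
# Example sketch (commented out; the score values are purely illustrative): updating the
# first model's GRACE scores outside the UI. Slider-style 0-100 inputs are scaled to the
# 0-1 range inside update_leaderboard:
#   LEADERBOARD_DF = update_leaderboard(80, 90, 70, 85, 60, model_idx=0)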
def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        print("Leaderboard DataFrame is empty or None; initializing an empty leaderboard.")
        return Leaderboard(
            value=pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)]),
            datatype=[c.type for c in fields(AutoEvalColumn)],
            select_columns=SelectColumns(
                default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
                label="Select the columns to display:",
            ),
            search_columns=[AutoEvalColumn.model.value.name, AutoEvalColumn.license.value.name],
            hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
            filter_columns=[],
            bool_checkboxgroup_label="Hide models",
            interactive=False,  # non-interactive
        )
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select the columns to display:",
        ),
        search_columns=[AutoEvalColumn.model.value.name, AutoEvalColumn.license.value.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.value.name, type="checkboxgroup", label="Model type"),
            ColumnFilter(AutoEvalColumn.precision.value.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.value.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.value.name, type="boolean", label="Deleted/incomplete", default=True
            ),
            # Slider filters for the GRACE scores
            ColumnFilter(
                AutoEvalColumn.generalization_score.value.name,
                type="slider",
                min=0.0,
                max=1.0,
                label="G: Generalization score",
                step=0.01,  # allow decimals
            ),
            ColumnFilter(
                AutoEvalColumn.relevance_score.value.name,
                type="slider",
                min=0.0,
                max=1.0,
                label="R: Relevance score",
                step=0.01,
            ),
            ColumnFilter(
                AutoEvalColumn.artistry_score.value.name,
                type="slider",
                min=0.0,
                max=1.0,
                label="A: Artistry score",
                step=0.01,
            ),
            ColumnFilter(
                AutoEvalColumn.consistency_score.value.name,
                type="slider",
                min=0.0,
                max=1.0,
                label="C: Consistency score",
                step=0.01,
            ),
            ColumnFilter(
                AutoEvalColumn.efficiency_score.value.name,
                type="slider",
                min=0.0,
                max=1.0,
                label="E: Efficiency score",
                step=0.01,
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,  # non-interactive
    )
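
# Note (assumption about gradio_leaderboard behaviour): search_columns, hide_columns and
# filter_columns are matched against the DataFrame's column headers, which is why the
# Column display names (AutoEvalColumn.<member>.value.name, e.g. "Model type") are used
# above rather than the Enum member names (e.g. "model_type").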
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("💬 Model comparison & generation", elem_id="model-comparison-tab", id=0):  # new tab
            gr.Markdown("## Enter your prompt and compare the outputs of the different models!", elem_classes="markdown-text")
            with gr.Row():
                input_prompt = gr.Textbox(label="Prompt", placeholder="Write a poem about spring.", lines=3)
                generate_button = gr.Button("Generate text")

            # One output box per model
            output_boxes = []
            for model_info in MODELS_TO_COMPARE:
                output_boxes.append(gr.Textbox(label=f"Output of {model_info['name']}", lines=5, interactive=False))

            # Wire the generate button to generate_text
            generate_button.click(
                fn=generate_text,
                inputs=[input_prompt],
                outputs=output_boxes,
            )

            gr.Markdown("## Manually score the GRACE dimensions", elem_classes="markdown-text")
            gr.Markdown("Please assess the generated outputs above and update the GRACE scores on the leaderboard.", elem_classes="markdown-text")

            # Dropdown for choosing which model to score; the choices are (label, value)
            # pairs, so the selected value is the model's index in MODELS_TO_COMPARE.
            model_selector = gr.Dropdown(
                choices=[(m["name"], idx) for idx, m in enumerate(MODELS_TO_COMPARE)],
                label="Select the model to score",
                interactive=True,
                value=0 if MODELS_TO_COMPARE else None,  # default to the first model's index
            )

            # GRACE dimension sliders
            with gr.Column():
                generalization_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="G: Generalization score (0-100)")
                relevance_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="R: Relevance score (0-100)")
                artistry_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="A: Artistry score (0-100)")
                consistency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="C: Consistency score (0-100)")
                efficiency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="E: Efficiency score (0-100)")
            update_grace_button = gr.Button("Push GRACE scores to the leaderboard")

            # The Leaderboard component must be defined before it is referenced
            leaderboard = init_leaderboard(LEADERBOARD_DF)  # initialize the Leaderboard component here

            # Wire the update button to the leaderboard update logic
            update_grace_button.click(
                fn=update_leaderboard,
                inputs=[
                    generalization_slider,
                    relevance_slider,
                    artistry_slider,
                    consistency_slider,
                    efficiency_slider,
                    model_selector,  # passes the selected model index
                ],
                outputs=leaderboard,  # the Leaderboard component to refresh
            )

        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1):  # adjusted id
            # The Leaderboard component was already created and rendered above; this is
            # only another reference to the same instance.
            leaderboard_display = leaderboard

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):  # kept, but with simplified content
            gr.Markdown("## In this demo the models are preloaded for comparison, so no new model submissions are needed.", elem_classes="markdown-text")
            gr.Markdown("You can enter prompts and score the models in the **💬 Model comparison & generation** tab.", elem_classes="markdown-text")
            gr.Markdown("(This page only preserves the original structure; the actual submission feature is disabled.)")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Scheduler that restarts the Space every 30 minutes.
# In this demo the models are preloaded and there is no ongoing evaluation queue,
# so restarting has little effect, but it is kept for completeness.
scheduler = BackgroundScheduler()
scheduler.add_job(API.restart_space, "interval", seconds=1800, args=[REPO_ID])
scheduler.start()

demo.queue(default_concurrency_limit=1).launch()  # low concurrency limit to avoid running out of memory