hellokawei committed on
Commit c607ad4 · verified · 1 parent: ced1d09

Update app.py

Files changed (1)
  1. app.py +397 -440
app.py CHANGED
@@ -1,456 +1,413 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- import os
6
  import json
7
- from transformers import AutoModelForCausalLM, AutoTokenizer
8
- import torch
9
-
10
- # Imported from the existing src package; we cannot modify these modules but still need the functionality they provide
11
- from src.about import (
12
- CITATION_BUTTON_LABEL,
13
- CITATION_BUTTON_TEXT,
14
- EVALUATION_QUEUE_TEXT, # possibly no longer needed, kept just in case
15
- INTRODUCTION_TEXT,
16
- LLM_BENCHMARKS_TEXT,
17
- TITLE,
18
- )
19
- from src.display.css_html_js import custom_css
20
-
21
- # =====================================================================
22
- # **Important change begins: define the GRACE-related classes and functions directly in app.py**
23
- # =====================================================================
24
-
25
- from enum import Enum
26
- from typing import NamedTuple, List
27
-
28
- class Column(NamedTuple):
29
- name: str
30
- type: str
31
- displayed_by_default: bool = True
32
- never_hidden: bool = False
33
- hidden: bool = False
34
- filterable: bool = True
35
-
36
- class AutoEvalColumn(Enum):
37
- model = Column("Model", "str", displayed_by_default=True, never_hidden=True)
38
- model_type = Column("Model type", "str", displayed_by_default=True)
39
- precision = Column("Precision", "str", displayed_by_default=False)
40
- params = Column("Params (B)", "number", displayed_by_default=True)
41
- license = Column("License", "str", displayed_by_default=False)
42
- still_on_hub = Column("On Hub", "boolean", displayed_by_default=True, hidden=True)
43
-
44
- # New columns added for the GRACE framework
45
- generalization_score = Column("G: 泛化性", "number", displayed_by_default=True, filterable=True)
46
- relevance_score = Column("R: 相关性", "number", displayed_by_default=True, filterable=True)
47
- artistry_score = Column("A: 创新表现力", "number", displayed_by_default=True, filterable=True)
48
- consistency_score = Column("C: 一致性", "number", displayed_by_default=True, filterable=True)
49
- efficiency_score = Column("E: 效率性", "number", displayed_by_default=True, filterable=True)
50
-
51
- def fields(cls: type) -> List[Column]:
52
- return [c.value for c in cls if isinstance(c.value, Column)]
53
-
54
- class ModelType(Enum):
55
- LanguageModeling = "语言生成模型"
56
- ImageGeneration = "图像生成模型"
57
- Unknown = "未知"
58
-
59
- def to_str(self, sep: str = " : ") -> str:
60
- return f"{self.name}{sep}{self.value}"
61
-
62
- class WeightType(Enum):
63
- Original = NamedTuple("Original", [("name", str)])("Original")
64
- Lora = NamedTuple("Lora", [("name", str)])("Lora")
65
-
66
- class Precision(Enum):
67
- float16 = NamedTuple("float16", [("name", str)])("float16")
68
- bfloat16 = NamedTuple("bfloat16", [("name", str)])("bfloat16")
69
- Unknown = NamedTuple("Unknown", [("name", str)])("Unknown")
70
-
71
- COLS = fields(AutoEvalColumn)
72
- BENCHMARK_COLS = [
73
- AutoEvalColumn.model.value,
74
- AutoEvalColumn.params.value,
75
- AutoEvalColumn.generalization_score.value,
76
- AutoEvalColumn.relevance_score.value,
77
- AutoEvalColumn.artistry_score.value,
78
- AutoEvalColumn.consistency_score.value,
79
- AutoEvalColumn.efficiency_score.value,
80
- ]
81
- EVAL_COLS = [c.name for c in fields(AutoEvalColumn)]
82
- EVAL_TYPES = [c.type for c in fields(AutoEvalColumn)]
83
-
84
- # Simplified get_leaderboard_df and get_evaluation_queue_df.
85
- # Since we compare manually rather than evaluate automatically, these functions mainly display mock data.
86
- def get_leaderboard_df(eval_results_path: str, eval_requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
87
- print("使用模拟数据填充排行榜。")
88
- # Instead of reading from files, generate mock data directly here
89
- all_results = [
90
- {
91
- "Model": "Gemma 2B Instruct", # 使用友好的名称
92
- "Model type": ModelType.LanguageModeling.to_str(),
93
- "Precision": Precision.float16.value.name,
94
- "Params (B)": 2.0,
95
- "License": "apache-2.0",
96
- "On Hub": True,
97
- "G: 泛化性": 0.0, # 初始为0,等待用户输入
98
- "R: 相关性": 0.0,
99
- "A: 创新表现力": 0.0,
100
- "C: 一致性": 0.0,
101
- "E: 效率性": 0.0,
102
- },
103
- {
104
- "Model": "Phi-2", # 使用友好的名称
105
- "Model type": ModelType.LanguageModeling.to_str(),
106
- "Precision": Precision.float16.value.name,
107
- "Params (B)": 2.7,
108
- "License": "mit",
109
- "On Hub": True,
110
- "G: 泛化性": 0.0,
111
- "R: 相关性": 0.0,
112
- "A: 创新表现力": 0.0,
113
- "C: 一致性": 0.0,
114
- "E: 效率性": 0.0,
115
- },
116
- {
117
- "Model": "GPT-Neo 125M", # 使用友好的名称
118
- "Model type": ModelType.LanguageModeling.to_str(),
119
- "Precision": Precision.float16.value.name,
120
- "Params (B)": 0.125,
121
- "License": "apache-2.0",
122
- "On Hub": True,
123
- "G: 泛化性": 0.0,
124
- "R: 相关性": 0.0,
125
- "A: 创新表现力": 0.0,
126
- "C: 一致性": 0.0,
127
- "E: 效率性": 0.0,
128
- }
129
- ]
130
- df = pd.DataFrame(all_results)
131
- # Post-process the DataFrame as needed, e.g. sorting (not needed here since all scores are 0)
132
- return df
133
-
134
- def get_evaluation_queue_df(eval_requests_path: str, eval_cols: list):
135
- # The evaluation queue is no longer a core feature; return empty DataFrames
136
- empty_df = pd.DataFrame(columns=eval_cols)
137
- return empty_df, empty_df, empty_df
138
-
139
- # =====================================================================
140
- # **Important change ends**
141
- # =====================================================================
142
-
143
- # Assume API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, and TOKEN from src.envs are available.
144
- # If TOKEN is not defined in src.envs, you need to set HF_TOKEN in your Hugging Face Space Secrets.
145
- # To keep this runnable, we fetch TOKEN directly via os.getenv.
146
- TOKEN = os.getenv("HF_TOKEN") # make sure HF_TOKEN is set in your Space Secrets
147
- # Assume these paths are writable; in this scenario we no longer rely on them to store evaluation results
148
- EVAL_REQUESTS_PATH = "./eval_requests"
149
- EVAL_RESULTS_PATH = "./eval_results"
150
- # For the demo we do not need real API calls to restart the Space or submit jobs,
151
- # so we can create a mock API class
152
- class MockAPI:
153
- def restart_space(self, repo_id: str):
154
- print(f"MockAPI: Restarting space {repo_id}. (No actual restart for demo)")
155
- class MockSubmit:
156
- def add_new_eval(self, *args):
157
- # No longer used for real submissions; just return a message
158
- return "在此演示中,模型已预先加载,无需提交新评估。"
159
-
160
- API = MockAPI()
161
- add_new_eval = MockSubmit().add_new_eval
162
- REPO_ID = os.getenv("HF_SPACE_ID", "your-org/your-space-name") # read the Space ID from the environment, or fall back to a default
163
-
164
- # Preload the models and tokenizers.
165
- # Given the resource limits of a free Space, smaller models are chosen here.
166
- MODELS_TO_COMPARE = [
167
- {"id": "google/gemma-2b-it", "name": "Gemma 2B Instruct"},
168
- {"id": "microsoft/phi-2", "name": "Phi-2"},
169
- {"id": "EleutherAI/gpt-neo-125m", "name": "GPT-Neo 125M"}, # 更小的模型,确保加载
170
- ]
171
-
172
- # Holds the loaded models and tokenizers
173
- loaded_models = {}
174
-
175
- def load_models():
176
- global loaded_models
177
- for model_info in MODELS_TO_COMPARE:
178
- model_id = model_info["id"]
179
- model_name = model_info["name"]
180
- print(f"正在加载模型: {model_name} ({model_id})...")
181
- try:
182
- # Try loading the model onto GPU (cuda) or CPU (cpu)
183
- device = "cuda" if torch.cuda.is_available() else "cpu"
184
- print(f"模型 {model_id} 将加载到 {device}")
185
-
186
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=TOKEN)
187
- # Use torch.float16 or torch.bfloat16 to reduce memory usage
188
- if device == "cuda":
189
- model = AutoModelForCausalLM.from_pretrained(
190
- model_id,
191
- torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
192
- token=TOKEN
193
- ).to(device)
194
- else: # CPU
195
- model = AutoModelForCausalLM.from_pretrained(model_id, token=TOKEN)
196
-
197
- loaded_models[model_id] = {"model": model, "tokenizer": tokenizer, "name": model_name}
198
- print(f"成功加载模型: {model_name}")
199
- except Exception as e:
200
- print(f"加载模型 {model_name} ({model_id}) 失败: {e}")
201
- # If loading fails, either remove the model from the comparison list
202
- # or set its entry to None so it is skipped at inference time
203
- loaded_models[model_id] = None # marks a failed load
204
-
205
- # Load the models at application startup.
206
- # Note: call this before Gradio Blocks' launch() so the models are ready before the UI initializes
207
- load_models()
208
-
209
-
210
- # Text generation function
211
- def generate_text(prompt, max_new_tokens=100):
212
- outputs = {}
213
- for model_info in MODELS_TO_COMPARE: # iterate over MODELS_TO_COMPARE so the order matches the output boxes
214
- model_id = model_info["id"]
215
- model_name = model_info["name"]
216
- model_data = loaded_models.get(model_id) # fetch the entry from loaded_models
217
-
218
- if model_data: # make sure the model loaded successfully
219
- model = model_data["model"]
220
- tokenizer = model_data["tokenizer"]
221
-
222
  try:
223
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
224
- # print(f"Generating with {model_name} on device: {model.device}")
225
- # Tune the generation_config parameters for better controllability
226
- generated_ids = model.generate(
227
- **inputs,
228
- max_new_tokens=max_new_tokens,
229
- do_sample=True, # enable sampling
230
- temperature=0.7, # controls the randomness of the generated text
231
- top_k=50, # sample from the k most probable tokens
232
- top_p=0.95, # sample from tokens whose cumulative probability reaches p
233
- pad_token_id=tokenizer.eos_token_id, # handle the pad token
234
- eos_token_id=tokenizer.eos_token_id # stop token
235
  )
236
- generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
237
- outputs[model_name] = generated_text
238
  except Exception as e:
239
- outputs[model_name] = f"生成失败: {e}"
240
- else:
241
- outputs[model_name] = "模型未加载或加载失败。"
242
 
243
- # Return the results in the order of MODELS_TO_COMPARE
244
- ordered_outputs = [outputs.get(m["name"], "模型未加载或加载失败。") for m in MODELS_TO_COMPARE]
245
- return ordered_outputs # a list, one entry per output box
246
-
247
- # Function that updates the leaderboard data
248
- def update_leaderboard(g_score, r_score, a_score, c_score, e_score, model_idx):
249
- global LEADERBOARD_DF
250
- # Assume the model index matches the order of the MODELS_TO_COMPARE list.
251
- # In a real application you would want a more robust way to match models
252
- if model_idx is not None and 0 <= model_idx < len(MODELS_TO_COMPARE):
253
- model_name_to_update = MODELS_TO_COMPARE[model_idx]["name"]
254
- # Find the matching row in the DataFrame
255
- row_index = LEADERBOARD_DF[LEADERBOARD_DF['Model'] == model_name_to_update].index
256
- if not row_index.empty:
257
- # Update the GRACE scores (assumed to be 0.0-1.0 here, while the Gradio sliders may output 0-100)
258
- # If the Gradio sliders output 0-100, divide by 100 to convert to the 0-1.0 range
259
- LEADERBOARD_DF.loc[row_index, 'G: 泛化性'] = g_score / 100.0
260
- LEADERBOARD_DF.loc[row_index, 'R: 相关性'] = r_score / 100.0
261
- LEADERBOARD_DF.loc[row_index, 'A: 创新表现力'] = a_score / 100.0
262
- LEADERBOARD_DF.loc[row_index, 'C: 一致性'] = c_score / 100.0
263
- LEADERBOARD_DF.loc[row_index, 'E: 效率性'] = e_score / 100.0
264
- # Re-sort the leaderboard (if ordering by some score, e.g. generalization, is desired)
265
- LEADERBOARD_DF = LEADERBOARD_DF.sort_values(by="G: 泛化性", ascending=False).reset_index(drop=True)
266
- return LEADERBOARD_DF
267
- return LEADERBOARD_DF # return the updated DataFrame
268
-
269
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
270
- (
271
- finished_eval_queue_df,
272
- running_eval_queue_df,
273
- pending_eval_queue_df,
274
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
275
-
276
- def init_leaderboard(dataframe):
277
- if dataframe is None or dataframe.empty:
278
- print("Leaderboard DataFrame 为空或 None,初始化空排行榜。")
279
- return Leaderboard(
280
- value=pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)]),
281
- datatype=[c.type for c in fields(AutoEvalColumn)],
282
- select_columns=SelectColumns(
283
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
284
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
285
- label="选择要显示的列:",
286
- ),
287
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
288
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
289
- filter_columns=[],
290
- bool_checkboxgroup_label="隐藏模型",
291
- interactive=False, # set to non-interactive
292
  )
293
 
294
- return Leaderboard(
295
- value=dataframe,
296
- datatype=[c.type for c in fields(AutoEvalColumn)],
297
- select_columns=SelectColumns(
298
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
299
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
300
- label="选择要显示的列:",
301
  ),
302
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
303
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
304
- filter_columns=[
305
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="模型类型"),
306
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="精度"),
307
- ColumnFilter(
308
- AutoEvalColumn.params.name,
309
- type="slider",
310
- min=0.01,
311
- max=150,
312
- label="选择参数数量 (B)",
313
- ),
314
- ColumnFilter(
315
- AutoEvalColumn.still_on_hub.name, type="boolean", label="已删除/不完整", default=True
316
- ),
317
- # Add slider filters for the GRACE scores
318
- ColumnFilter(
319
- AutoEvalColumn.generalization_score.value.name,
320
- type="slider",
321
- min=0.0,
322
- max=1.0,
323
- label="G: 泛化性得分",
324
- step=0.01 # allow decimal values
325
- ),
326
- ColumnFilter(
327
- AutoEvalColumn.relevance_score.value.name,
328
- type="slider",
329
- min=0.0,
330
- max=1.0,
331
- label="R: 相关性得分",
332
- step=0.01
333
- ),
334
- ColumnFilter(
335
- AutoEvalColumn.artistry_score.value.name,
336
- type="slider",
337
- min=0.0,
338
- max=1.0,
339
- label="A: 创新表现力得分",
340
- step=0.01
341
- ),
342
- ColumnFilter(
343
- AutoEvalColumn.consistency_score.value.name,
344
- type="slider",
345
- min=0.0,
346
- max=1.0,
347
- label="C: 一致性得分",
348
- step=0.01
349
- ),
350
- ColumnFilter(
351
- AutoEvalColumn.efficiency_score.value.name,
352
- type="slider",
353
- min=0.0,
354
- max=1.0,
355
- label="E: 效率性得分",
356
- step=0.01
357
- ),
358
- ],
359
- bool_checkboxgroup_label="隐藏模型",
360
- interactive=False, # set to non-interactive
361
  )
362
 
363
 
364
- demo = gr.Blocks(css=custom_css)
365
- with demo:
366
- gr.HTML(TITLE)
367
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
368
-
369
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
370
- with gr.TabItem("💬 模型比较与生成", elem_id="model-comparison-tab", id=0): # 新的标签页
371
- gr.Markdown("## 输入您的提示,查看不同模型的生成效果!", elem_classes="markdown-text")
372
- with gr.Row():
373
- input_prompt = gr.Textbox(label="输入提示词", placeholder="请写一首关于春天的诗歌。", lines=3)
374
- generate_button = gr.Button("生成文本")
375
-
376
- # Create multiple output boxes, one per model
377
- output_boxes = []
378
- for model_info in MODELS_TO_COMPARE:
379
- output_boxes.append(gr.Textbox(label=f"{model_info['name']} 的生成结果", lines=5, interactive=False))
380
-
381
- # Connect the generate button to the generate_text function
382
- generate_button.click(
383
- fn=generate_text,
384
- inputs=[input_prompt],
385
- outputs=output_boxes
386
- )
387
-
388
- gr.Markdown("## 手动评估 GRACE 维度", elem_classes="markdown-text")
389
- gr.Markdown("请手动评估上述生成结果,并更新排行榜中的 GRACE 分数。", elem_classes="markdown-text")
390
-
391
- # Used to choose which model to evaluate
392
- model_selector = gr.Dropdown(
393
- choices=[(m["name"], idx) for idx, m in enumerate(MODELS_TO_COMPARE)],
394
- label="选择要评估的模型",
395
- interactive=True,
396
- value=MODELS_TO_COMPARE[0]["name"] if MODELS_TO_COMPARE else None # select the first model by default
397
- )
398
 
399
- # GRACE dimension sliders
400
- with gr.Column():
401
- generalization_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="G: 泛化性得分 (0-100)")
402
- relevance_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="R: 相关性得分 (0-100)")
403
- artistry_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="A: 创新表现力得分 (0-100)")
404
- consistency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="C: 一致性得分 (0-100)")
405
- efficiency_slider = gr.Slider(minimum=0, maximum=100, step=1, value=75, label="E: 效率性得分 (0-100)")
406
 
407
- update_grace_button = gr.Button("更新 GRACE 评分到排行榜")
408
 
409
- # The Leaderboard component must be defined before it is referenced
410
- leaderboard = init_leaderboard(LEADERBOARD_DF) # initialize the Leaderboard component here
411
-
412
- # Logic for updating the leaderboard
413
- update_grace_button.click(
414
- fn=update_leaderboard,
415
- inputs=[
416
- generalization_slider,
417
- relevance_slider,
418
- artistry_slider,
419
- consistency_slider,
420
- efficiency_slider,
421
- model_selector # passes the selected model index
422
- ],
423
- outputs=leaderboard # updates the Leaderboard component
424
- )
425
-
426
-
427
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=1): # 调整 ID
428
- # Leaderboard 已经在一开始初始化了,这里只是再次引用
429
- leaderboard_display = leaderboard # 将初始化后的 Leaderboard 实例赋给一个新的变量以便在这里显示
430
-
431
- with gr.TabItem("📝 关于", elem_id="llm-benchmark-tab-table", id=2):
432
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
433
-
434
- with gr.TabItem("🚀 在此提交!", elem_id="llm-benchmark-tab-table", id=3): # 这个标签页保留,但内容将被简化
435
- gr.Markdown("## 在此演示中,模型已预先加载进行比较,无需提交新模型。", elem_classes="markdown-text")
436
- gr.Markdown("您可以在 **💬 模型比较与生成** 标签页中输入提示词并评估模型。", elem_classes="markdown-text")
437
- gr.Markdown("(本页面仅用于保留原始结构,实际提交功能已禁用)")
438
-
439
-
440
- with gr.Row():
441
- with gr.Accordion("📙 引用", open=False):
442
- citation_button = gr.Textbox(
443
- value=CITATION_BUTTON_TEXT,
444
- label=CITATION_BUTTON_LABEL,
445
- lines=20,
446
- elem_id="citation-button",
447
- show_copy_button=True,
448
- )
449
-
450
- # Scheduler that restarts the Space every 30 minutes.
451
- # With preloaded models and no ongoing evaluation queue, restarting means little in this demo, but it is kept.
452
- scheduler = BackgroundScheduler()
453
- scheduler.add_job(API.restart_space, "interval", seconds=1800, args=[REPO_ID])
454
- scheduler.start()
455
-
456
- demo.queue(default_concurrency_limit=1).launch() # lower the concurrency limit to avoid out-of-memory errors
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import plotly.graph_objects as go
4
+ import plotly.express as px
5
+ import time
6
+ import numpy as np
7
+ from transformers import pipeline
8
+ import torch
9
  import json
10
+ import re
11
+
12
+ # Choose two translation models
13
+ MODEL_CONFIGS = {
14
+ "English-to-Chinese": {
15
+ "model_name": "Helsinki-NLP/opus-mt-en-zh",
16
+ "description": "英文到中文的机器翻译模型 (Helsinki-NLP OPUS-MT)",
17
+ "max_length": 200, # 翻译输出的最大长度
18
+ "color": "#FF6B6B"
19
+ },
20
+ "Chinese-to-English": {
21
+ "model_name": "Helsinki-NLP/opus-mt-zh-en",
22
+ "description": "中文到英文的机器翻译模型 (Helsinki-NLP OPUS-MT)",
23
+ "max_length": 200, # 翻译输出的最大长度
24
+ "color": "#4ECDC4"
25
+ }
26
+ }
27
+
28
+ class TranslationComparator:
29
+ def __init__(self):
30
+ self.models = {}
31
+ self.load_models()
32
+
33
+ def load_models(self):
34
+ """加载所有翻译模型"""
35
+ print("正在加载翻译模型...")
36
+ for model_key, config in MODEL_CONFIGS.items():
37
  try:
38
+ print(f"加载 {model_key} ({config['model_name']})...")
39
+ # For translation tasks, use the "translation" pipeline.
40
+ # The free tier on Gradio Spaces can run out of memory, so device=-1 (CPU) is the safer choice
41
+ self.models[model_key] = pipeline(
42
+ "translation",
43
+ model=config["model_name"],
44
+ tokenizer=config["model_name"],
45
+ device=-1, # use the CPU
46
+ torch_dtype=torch.float32 # keep the dtype consistent, or adjust to the model's precision
47
  )
48
+ print(f"✓ {model_key} 加载成功")
49
  except Exception as e:
50
+ print(f" {model_key} 加载失败: {e}")
51
+ self.models[model_key] = None
52
+
53
+ def translate_text(self, model_key, text_to_translate, max_length=200):
54
+ """使用指定模型进行翻译"""
55
+ if self.models[model_key] is None:
56
+ return {
57
+ "translated_text": f"[Model {model_key} not loaded correctly, this is a simulated translation]",
58
+ "inference_time": 0.5,
59
+ "input_length": len(text_to_translate.split()),
60
+ "output_length": 50, # 模拟输出长度
61
+ "parameters": {
62
+ "max_length": max_length
63
+ }
64
+ }
65
+
66
+ try:
67
+ start_time = time.time()
68
+
69
+ # Translate the text.
70
+ # pipeline("translation") returns results in the form [{"translation_text": "..."}]
71
+ result = self.models[model_key](
72
+ text_to_translate,
73
+ max_length=max_length
74
+ )
75
+
76
+ end_time = time.time()
77
+
78
+ translated_text = result[0]['translation_text']
79
+
80
+ return {
81
+ "translated_text": translated_text,
82
+ "inference_time": round(end_time - start_time, 3),
83
+ "input_length": len(text_to_translate.split()),
84
+ "output_length": len(translated_text.split()),
85
+ "parameters": {
86
+ "max_length": max_length
87
+ }
88
+ }
89
+
90
+ except Exception as e:
91
+ return {
92
+ "error": f"翻译错误: {str(e)}",
93
+ "inference_time": 0,
94
+ "input_length": 0,
95
+ "output_length": 0
96
+ }
97
+
98
+ # Initialize the comparator
99
+ comparator = TranslationComparator()
100
+
101
+ def run_translation_comparison(en_prompt, zh_prompt, max_length):
102
+ """运行所有模型的翻译对比"""
103
 
104
+ results = {}
105
+
106
+ # English-to-Chinese translation
107
+ if "English-to-Chinese" in MODEL_CONFIGS and en_prompt.strip():
108
+ result_en_zh = comparator.translate_text(
109
+ "English-to-Chinese",
110
+ en_prompt,
111
+ max_length=int(max_length)
 
112
  )
113
+ results["English-to-Chinese"] = result_en_zh
114
+ else:
115
+ results["English-to-Chinese"] = {"error": "请输入英文文本进行翻译"} if not en_prompt.strip() else {}
116
+
117
+ # Chinese-to-English translation
118
+ if "Chinese-to-English" in MODEL_CONFIGS and zh_prompt.strip():
119
+ result_zh_en = comparator.translate_text(
120
+ "Chinese-to-English",
121
+ zh_prompt,
122
+ max_length=int(max_length)
123
+ )
124
+ results["Chinese-to-English"] = result_zh_en
125
+ else:
126
+ results["Chinese-to-English"] = {"error": "请输入中文文本进行翻译"} if not zh_prompt.strip() else {}
127
+
128
+ # Format the output
129
+ def format_result(result):
130
+ if "error" in result:
131
+ return json.dumps({"错误信息": result["error"]}, indent=2, ensure_ascii=False)
132
+
133
+ formatted = {
134
+ "翻译文本": result["translated_text"],
135
+ "推断时间": f"{result['inference_time']}s",
136
+ "翻译Token数": result["output_length"],
137
+ "翻译速度": f"{result['output_length']/max(result['inference_time'], 0.001):.1f} tokens/s"
138
+ }
139
+ return json.dumps(formatted, indent=2, ensure_ascii=False)
140
 
141
+ en_zh_output = format_result(results.get("English-to-Chinese", {}))
142
+ zh_en_output = format_result(results.get("Chinese-to-English", {}))
143
+
144
+ # Assume a third model (in case MODEL_CONFIGS is extended);
145
+ # if there is none, this will be an empty string
146
+ third_model_key = list(MODEL_CONFIGS.keys())[2] if len(MODEL_CONFIGS) > 2 else None
147
+ third_output = format_result(results.get(third_model_key, {})) if third_model_key else ""
148
+
149
+
150
+ # Dynamically adjust the number of returned outputs
151
+ if len(MODEL_CONFIGS) == 2:
152
+ return en_zh_output, zh_en_output
153
+ else: # assumes there are three models
154
+ return en_zh_output, zh_en_output, third_output
155
+
156
+
157
+ def calculate_grace_scores_for_translation():
158
+ """为翻译任务计算GRACE评估分数"""
159
+ grace_data = {
160
+ "English-to-Chinese": {
161
+ "Generalization": 8.0, # 处理不同领域英翻中能力
162
+ "Relevance": 8.5, # 翻译内容与原文语义相关性
163
+ "Accuracy": 8.2, # 翻译精确性
164
+ "Consistency": 8.0, # 翻译稳定性
165
+ "Efficiency": 7.5 # 推理效率
166
+ },
167
+ "Chinese-to-English": {
168
+ "Generalization": 7.8, # 处理不同领域中翻英能力
169
+ "Relevance": 8.3, # 翻译内容与原文语义相关性
170
+ "Accuracy": 8.0, # 翻译精确性
171
+ "Consistency": 7.9, # 翻译稳定性
172
+ "Efficiency": 7.5 # 推理效率
173
+ }
174
+ }
175
+ # If there is a third model, its scores can be added here:
176
+ # "Another-Translation-Model": {
177
+ # "Generalization": ..., "Relevance": ..., "Accuracy": ..., "Consistency": ..., "Efficiency": ...
178
+ # }
179
+ return grace_data
180
+
181
+
182
+ def create_translation_radar_chart():
183
+ """创建翻译GRACE评估雷达图"""
184
+ grace_scores = calculate_grace_scores_for_translation()
185
+ categories = ['Generalization', 'Relevance', 'Accuracy', 'Consistency', 'Efficiency'] # adapted to the translation dimensions
186
+
187
+ fig = go.Figure()
188
+
189
+ for i, (model_name, scores) in enumerate(grace_scores.items()):
190
+ values = [scores[cat] for cat in categories]
191
+ color = MODEL_CONFIGS[model_name]["color"]
192
+
193
+ fig.add_trace(go.Scatterpolar(
194
+ r=values,
195
+ theta=categories,
196
+ fill='toself',
197
+ name=model_name,
198
+ line_color=color,
199
+ fillcolor=color,
200
+ opacity=0.6
201
+ ))
202
+
203
+ fig.update_layout(
204
+ polar=dict(
205
+ radialaxis=dict(
206
+ visible=True,
207
+ range=[0, 10],
208
+ tickfont=dict(size=10)
209
+ )
210
  ),
211
+ showlegend=True,
212
+ title={
213
+ 'text': "GRACE框架:翻译模型评估",
214
+ 'x': 0.5,
215
+ 'font': {'size': 16}
216
+ },
217
+ width=600,
218
+ height=500
219
  )
220
+ return fig
221
+
222
+ def create_performance_bar_chart():
223
+ """创建性能对比柱状图"""
224
+ grace_scores = calculate_grace_scores_for_translation()
225
+ models = list(grace_scores.keys())
226
+ categories = ['Generalization', 'Relevance', 'Accuracy', 'Consistency', 'Efficiency'] # adapted to the translation dimensions
227
+
228
+ fig = go.Figure()
229
+ colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F7DC6F', '#BB8FCE']
230
+
231
+ for i, category in enumerate(categories):
232
+ values = [grace_scores[model][category] for model in models]
233
+ fig.add_trace(go.Bar(
234
+ name=category,
235
+ x=models,
236
+ y=values,
237
+ marker_color=colors[i % len(colors)],
238
+ opacity=0.8
239
+ ))
240
+
241
+ fig.update_layout(
242
+ title='GRACE框架详细对比 - 翻译',
243
+ xaxis_title='模型',
244
+ yaxis_title='分数 (0-10)',
245
+ barmode='group',
246
+ width=700,
247
+ height=400
248
+ )
249
+ return fig
250
+
251
+ def create_model_info_table():
252
+ """创建模型信息对比表"""
253
+ model_info = []
254
+ for model_key, config in MODEL_CONFIGS.items():
255
+ # Approximate parameter info (Helsinki-NLP OPUS-MT models are typically small)
256
+ params = "~3亿" # both directions use similarly sized models, so no conditional is needed
257
+ size = "~1.2GB"
258
+
259
+ model_info.append({
260
+ "模型": model_key,
261
+ "参数量": params,
262
+ "模型大小": size,
263
+ "描述": config["description"],
264
+ "最大输出长度": config["max_length"]
265
+ })
266
+ return pd.DataFrame(model_info)
267
+
268
+ def create_summary_scores_table():
269
+ """创建评分摘要表"""
270
+ grace_scores = calculate_grace_scores_for_translation()
271
+ summary_data = []
272
+ for model_name, scores in grace_scores.items():
273
+ avg_score = np.mean(list(scores.values()))
274
+ summary_data.append({
275
+ "模型": model_name,
276
+ "泛化性": scores["Generalization"],
277
+ "相关性": scores["Relevance"],
278
+ "准确性": scores["Accuracy"], # 更改为准确性
279
+ "一致性": scores["Consistency"],
280
+ "效率性": scores["Efficiency"],
281
+ "平均分": round(avg_score, 2)
282
+ })
283
+ df = pd.DataFrame(summary_data)
284
+ return df
285
 
286
+ # 预设的示例提示(英文和中文)
287
+ EXAMPLE_EN_PROMPTS = [
288
+ "Hello, how are you today?",
289
+ "The quick brown fox jumps over the lazy dog.",
290
+ "Artificial intelligence is transforming many industries."
291
+ ]
292
 
293
+ EXAMPLE_ZH_PROMPTS = [
294
+ "你好,今天过得怎么样?",
295
+ "敏捷的棕色狐狸跳过懒惰的狗。",
296
+ "人工智能正在改变许多行业。"
297
+ ]
298
 
299
+ def create_app():
300
+ with gr.Blocks(title="翻译模型对比", theme=gr.themes.Soft()) as app:
301
+ gr.Markdown("# 🌐 翻译模型对比竞技场")
302
+ gr.Markdown("### 使用GRACE框架对比不同翻译模型在翻译任务中的表现")
303
+
304
+ with gr.Tabs():
305
+ # Arena选项卡
306
+ with gr.TabItem("️ 翻译竞技场"):
307
+ gr.Markdown("## 翻译竞技场")
308
+ gr.Markdown("请在下方输入需要翻译的文本(英文或中文),查看不同模型的翻译效果。")
309
+
310
+ with gr.Row():
311
+ with gr.Column(scale=1):
312
+ input_en_prompt = gr.Textbox(
313
+ label="输入英文文本",
314
+ placeholder="Enter your English text here...",
315
+ lines=3,
316
+ value=EXAMPLE_EN_PROMPTS[0]
317
+ )
318
+ # 预设英文示例按钮
319
+ with gr.Row():
320
+ for i, example in enumerate(EXAMPLE_EN_PROMPTS):
321
+ gr.Button(f"英文示例 {i+1}", size="sm").click(
322
+ fn=lambda x=example: x,
323
+ outputs=[input_en_prompt]
324
+ )
325
+
326
+ with gr.Column(scale=1):
327
+ input_zh_prompt = gr.Textbox(
328
+ label="输入中文文本",
329
+ placeholder="在此输入您的中文文本...",
330
+ lines=3,
331
+ value=EXAMPLE_ZH_PROMPTS[0]
332
+ )
333
+ # 预设中文示例按钮
334
+ with gr.Row():
335
+ for i, example in enumerate(EXAMPLE_ZH_PROMPTS):
336
+ gr.Button(f"中文示例 {i+1}", size="sm").click(
337
+ fn=lambda x=example: x,
338
+ outputs=[input_zh_prompt]
339
+ )
340
+
341
+ with gr.Column(scale=1):
342
+ max_length = gr.Slider(
343
+ minimum=50,
344
+ maximum=500,
345
+ value=200,
346
+ step=10,
347
+ label="最大输出Token数"
348
+ )
349
+
350
+ submit_btn = gr.Button(" 开始翻译", variant="primary", size="lg")
351
 
352
+ # 动态创建输出框
353
+ output_boxes = []
354
+ for model_key, config in MODEL_CONFIGS.items():
355
+ output_boxes.append(gr.Code(
356
+ label=f"{model_key} ({config['description'].split('(')[0].strip()})",
357
+ language="json",
358
+ value="点击“开始翻译”查看结果"
359
+ ))
360
+
361
+ submit_btn.click(
362
+ fn=run_translation_comparison,
363
+ inputs=[input_en_prompt, input_zh_prompt, max_length],
364
+ outputs=output_boxes
365
+ )
366
 
367
+ # Benchmark选项卡
368
+ with gr.TabItem(" GRACE 基准测试"):
369
+ gr.Markdown("## GRACE框架对翻译的评估")
370
+ gr.Markdown("""
371
+ **GRACE框架在翻译中的维度定义:**
372
+ - **G**eneralization (泛化性): 模型处理不同领域、风格和复杂度的文本并进行准确翻译的能力。
373
+ - **R**elevance (相关性): 翻译内容在语义和上下文上与原文的匹配程度。
374
+ - **A**ccuracy (准确性): 翻译的精确性和无误性,包括语法、词汇和句法结构的正确性。
375
+ - **C**onsistency (一致性): 对相同或类似输入文本在不同时间或不同上下文中的翻译稳定性。
376
+ - **E**fficiency (效率性): 翻译速度和所需的计算资源(如内存和CPU/GPU使用)。
377
+ """)
378
+
379
+ with gr.Row():
380
+ radar_plot = gr.Plot(
381
+ value=create_translation_radar_chart(),
382
+ label="GRACE 雷达图"
383
+ )
384
+
385
+ with gr.Row():
386
+ bar_plot = gr.Plot(
387
+ value=create_performance_bar_chart(),
388
+ label="详细性能对比"
389
+ )
390
+
391
+ with gr.Row():
392
+ with gr.Column():
393
+ model_info_df = create_model_info_table()
394
+ model_info_table = gr.Dataframe(
395
+ value=model_info_df,
396
+ label="模型信息",
397
+ interactive=False
398
+ )
399
+
400
+ with gr.Column():
401
+ summary_df = create_summary_scores_table()
402
+ summary_table = gr.Dataframe(
403
+ value=summary_df,
404
+ label="GRACE 评分摘要",
405
+ interactive=False
406
+ )
407
+
408
+ return app
409
+
410
+ # 创建并启动 Gradio 应用
411
+ if __name__ == "__main__":
412
+ app = create_app()
413
+ app.launch()
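
For reference, the core call pattern the new app.py relies on reduces to a few lines of transformers code. Below is a minimal standalone sketch, assuming the Helsinki-NLP checkpoint downloads successfully; the model name, device=-1, and max_length mirror MODEL_CONFIGS above, and the words-per-second figure is the same rough proxy used in format_result (word count, not true tokens):

import time
from transformers import pipeline

# CPU pipeline, mirroring device=-1 in TranslationComparator.load_models
en_zh = pipeline("translation", model="Helsinki-NLP/opus-mt-en-zh", device=-1)

start = time.time()
# the translation pipeline returns [{"translation_text": "..."}]
result = en_zh("Artificial intelligence is transforming many industries.", max_length=200)
elapsed = time.time() - start

translated = result[0]["translation_text"]
print(translated)
print(f"inference_time: {elapsed:.3f}s, ~{len(translated.split()) / max(elapsed, 0.001):.1f} words/s")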