xiaomoguhzz commited on
Commit
79af36d
·
verified ·
1 Parent(s): 576713f

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -9228,3 +9228,5 @@ images/images/texture/true/t2i_r1_run_03956_00000.png filter=lfs diff=lfs merge=
9228
  images/images/texture/true/t2i_r1_run_04173_00009.png filter=lfs diff=lfs merge=lfs -text
9229
  images/images/texture/true/t2i_r1_run_04271_00005.png filter=lfs diff=lfs merge=lfs -text
9230
  images/images/texture/true/t2i_r1_run_04307_00003.png filter=lfs diff=lfs merge=lfs -text
 
 
 
9228
  images/images/texture/true/t2i_r1_run_04173_00009.png filter=lfs diff=lfs merge=lfs -text
9229
  images/images/texture/true/t2i_r1_run_04271_00005.png filter=lfs diff=lfs merge=lfs -text
9230
  images/images/texture/true/t2i_r1_run_04307_00003.png filter=lfs diff=lfs merge=lfs -text
9231
+ reflectionbench_demo/detailed_cases_qwen235b.json filter=lfs diff=lfs merge=lfs -text
9232
+ reflectionbench_demo/detailed_cases_qwen.json filter=lfs diff=lfs merge=lfs -text
reflectionbench_demo/app.py ADDED
@@ -0,0 +1,349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ ReflectionBench Case 展示 Gradio 应用 (HuggingFace Spaces 版本 v2)
4
+
5
+ 启动时自动解压 images.zip 和 edited_images.zip
6
+ """
7
+
8
+ import gradio as gr
9
+ import json
10
+ import zipfile
11
+ from pathlib import Path
12
+ from PIL import Image
13
+ import numpy as np
14
+ from typing import Dict, List, Optional
15
+
16
+ # ============================================================================
17
+ # 初始化:解压图像文件
18
+ # ============================================================================
19
+
20
# Directory containing this script; all data files are resolved relative to it.
CURRENT_DIR = Path(__file__).parent

def extract_if_needed():
    """Unzip images.zip / edited_images.zip next to this file, unless the
    corresponding directory has already been extracted."""
    for archive in ("images.zip", "edited_images.zip"):
        archive_path = CURRENT_DIR / archive
        target_dir = CURRENT_DIR / Path(archive).stem
        # Skip when the archive is absent or already unpacked.
        if not archive_path.exists() or target_dir.exists():
            continue
        print(f"Extracting {archive}...")
        with zipfile.ZipFile(archive_path, 'r') as zf:
            zf.extractall(CURRENT_DIR)
        print(f" Done: {target_dir}")

# Extract archives once at startup.
extract_if_needed()
36
+
37
+ # ============================================================================
38
+ # 配置
39
+ # ============================================================================
40
+
41
# Evaluation models: UI key -> suffix used in the detailed_cases_*.json filename.
EVAL_MODELS = {
    "qwen": "qwen_eval",
    "qwen235b": "qwen235b_eval",
}

# Fine-tuned reflector checkpoints shown as the "baseline" side of a comparison.
# List order drives the dropdown order in the UI.
BASELINE_MODELS = [
    "qwen_ft_v1_step900",
    "qwen_ft_v3_step730",
    "qwen_ft_v4_step765",
    "qwen_ft_v5_step920",
    "qwen_ft_v6_step910",
    "qwen_ft_v7_step445",
    "qwen_ft_v8_step365",
    "qwen_ft_v8_step795",
]

# Competitor models shown as the "comparison" side. Order drives dropdown order.
COMPARISON_MODELS = [
    "qwen",
    "qwen3vl",
    "qwen3vl_thinking",
    "bagel",
    "omnigen2",
    "omniverifier",
    "unicot",
    "sld",
    "reflect_dit",
    "reflectionflow_qwen8b",
    "thinkgen",
    "reasonedit",
]

# Default editor used by each model; the editor name is part of the
# edited-image filename (see get_edited_image_path). A per-case override
# can still come from the JSON ("baseline_editor"/"comparison_editor").
EDITOR_CONFIG = {
    "qwen_ft_v1_step900": "qwen_image_2511",
    "qwen_ft_v3_step730": "qwen_image_2511",
    "qwen_ft_v4_step765": "qwen_image_2511",
    "qwen_ft_v5_step920": "qwen_image_2511",
    "qwen_ft_v6_step910": "qwen_image_2511",
    "qwen_ft_v7_step445": "qwen_image_2511",
    "qwen_ft_v8_step365": "qwen_image_2511",
    "qwen_ft_v8_step795": "qwen_image_2511",
    "qwen": "qwen_image_2511",
    "qwen3vl": "qwen_image_2511",
    "qwen3vl_thinking": "qwen_image_2511",
    "omniverifier": "qwen_image_2511",
    "sld": "qwen_image_2511",
    "bagel": "bagel",
    "omnigen2": "omnigen2",
    "unicot": "unicot",
    "reflect_dit": "reflect_dit",
    "reflectionflow_qwen8b": "reflectionflow",
    "thinkgen": "thinkgen",
    "reasonedit": "reasonedit",
}

# Case-type keys (as stored in the JSON) -> human-readable display names.
CASE_TYPE_NAMES = {
    "type1_answer_wrong": "Type 1: Answer 错误",
    "type2_explanation_wrong": "Type 2: Explanation 错误→编辑失败",
    "type3_edit_better": "Type 3: Edit Prompt 更优",
}
100
+
101
+
102
+ # ============================================================================
103
+ # 数据加载
104
+ # ============================================================================
105
+
106
def load_cases(eval_model_key: str) -> Dict:
    """Load the detailed-cases JSON for one eval model.

    Returns an empty dict when the file is missing or unreadable — e.g. when
    it is still an unresolved git-lfs pointer instead of real JSON (the
    original code would crash the app with a JSONDecodeError in that case).
    """
    filepath = CURRENT_DIR / f"detailed_cases_{eval_model_key}.json"
    if not filepath.exists():
        return {}
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        # Degrade gracefully: the UI treats {} as "data not loaded".
        print(f"Failed to load {filepath.name}: {e}")
        return {}
113
+
114
+ _cases_cache = {}
115
+
116
+ def get_cases(eval_model_key: str) -> Dict:
117
+ if eval_model_key not in _cases_cache:
118
+ _cases_cache[eval_model_key] = load_cases(eval_model_key)
119
+ return _cases_cache[eval_model_key]
120
+
121
+
122
+ # ============================================================================
123
+ # 图片处理
124
+ # ============================================================================
125
+
126
def load_image_256(path: Path) -> Optional[np.ndarray]:
    """Load an image, downscale in place to fit within 256x256, return ndarray.

    Returns None when *path* is falsy, missing, or unreadable.
    """
    if not path or not path.exists():
        return None
    try:
        # Context manager closes the underlying file handle promptly;
        # Image.open is lazy and otherwise keeps the file open until GC.
        with Image.open(path) as img:
            img.thumbnail((256, 256), Image.Resampling.LANCZOS)
            return np.array(img)
    except Exception:
        # Best-effort viewer: a corrupt/unsupported image just shows as empty.
        return None
135
+
136
+
137
def get_bad_image_path(bad_image_rel: str) -> Path:
    """Resolve a case's original ("bad") image path under the images/ tree."""
    return CURRENT_DIR.joinpath("images", bad_image_rel)
139
+
140
+
141
def get_edited_image_path(bad_image_rel: str, verifier: str, editor: str) -> Path:
    """Build the edited-image path: edited_images/{editor}_{verifier}_{stem}{ext}."""
    rel = Path(bad_image_rel)
    edited_name = f"{editor}_{verifier}_{rel.stem}{rel.suffix}"
    return CURRENT_DIR / "edited_images" / edited_name
145
+
146
+
147
+ # ============================================================================
148
+ # Gradio 回调函数
149
+ # ============================================================================
150
+
151
def get_case_list(eval_model_key: str, baseline: str, comparison: str, case_type: str) -> List[str]:
    """Build dropdown labels ("idx=… | category | prompt…") for the selected bucket."""
    cases_data = get_cases(eval_model_key)
    if not cases_data:
        return []
    bucket = cases_data.get(baseline, {}).get(comparison, {}).get(case_type, [])
    labels = []
    for entry in bucket:
        # Truncate the prompt so the dropdown stays readable.
        labels.append(f"idx={entry['idx']} | {entry['category']} | {entry['original_prompt'][:40]}...")
    return labels
157
+
158
+
159
def update_case_list(eval_model_key: str, baseline: str, comparison: str, case_type: str):
    """Refresh the case dropdown; pre-select the first entry when available."""
    choices = get_case_list(eval_model_key, baseline, comparison, case_type)
    selected = choices[0] if choices else None
    return gr.update(choices=choices, value=selected)
162
+
163
+
164
def get_comparison_choices(eval_model_key: str, baseline: str) -> List[str]:
    """List comparison models that have at least one case against *baseline*.

    Falls back to the full COMPARISON_MODELS list when no data is available.
    """
    cases_data = get_cases(eval_model_key)
    if not cases_data or baseline not in cases_data:
        return COMPARISON_MODELS
    baseline_data = cases_data.get(baseline, {})
    # Keep a comparison model only if any case-type bucket is non-empty.
    available = [
        comp for comp in COMPARISON_MODELS
        if any(baseline_data.get(comp, {}).get(t, []) for t in CASE_TYPE_NAMES)
    ]
    return available or COMPARISON_MODELS
175
+
176
+
177
def update_comparison_choices(eval_model_key: str, baseline: str):
    """Refresh the comparison dropdown; pre-select the first available model."""
    options = get_comparison_choices(eval_model_key, baseline)
    selected = options[0] if options else None
    return gr.update(choices=options, value=selected)
180
+
181
+
182
def show_case(eval_model_key: str, baseline: str, comparison: str, case_type: str, case_idx_str: str):
    """Render a single case for the UI.

    Returns a 9-tuple matching the `outputs` list wired up in create_app():
    (info markdown, ground-truth markdown, baseline column, comparison column,
     summary line, image caption, original image, baseline-edited image,
     comparison-edited image). Error states reuse the same tuple shape with a
    message in the first slot.
    """
    empty_result = ("请选择 case", "", "", "", "", "", None, None, None)

    if not case_idx_str:
        return empty_result

    cases_data = get_cases(eval_model_key)
    if not cases_data:
        return ("数据未加载", "", "", "", "", "", None, None, None)

    # The dropdown label has the form "idx=N | category | prompt..."; recover N.
    try:
        idx = int(case_idx_str.split("|")[0].replace("idx=", "").strip())
    except (ValueError, IndexError):
        return ("无法解析 idx", "", "", "", "", "", None, None, None)

    cases = cases_data.get(baseline, {}).get(comparison, {}).get(case_type, [])
    case = next((c for c in cases if c["idx"] == idx), None)

    if not case:
        return ("未找到 case", "", "", "", "", "", None, None, None)

    # Images: the original ("bad") image plus each model's edited version.
    # Editor names may be overridden per case; default comes from EDITOR_CONFIG.
    bad_img = load_image_256(get_bad_image_path(case["bad_image"]))
    baseline_editor = case.get("baseline_editor", EDITOR_CONFIG.get(baseline, "qwen_image_2511"))
    comparison_editor = case.get("comparison_editor", EDITOR_CONFIG.get(comparison, "qwen_image_2511"))

    baseline_img = load_image_256(get_edited_image_path(case["bad_image"], baseline, baseline_editor))
    comparison_img = load_image_256(get_edited_image_path(case["bad_image"], comparison, comparison_editor))

    # Markdown columns for the case metadata and both models' outputs.
    info_col = f"""### Case 详情
- **评估模型**: {eval_model_key.upper()}
- **类型**: {CASE_TYPE_NAMES.get(case_type, case_type)}
- **idx**: {case["idx"]} | **类别**: {case["category"]}
- **Prompt**: *{case["original_prompt"]}*"""

    gt_col = f"""### Ground Truth
- **Answer**: `{case["gt_answer"]}`
- **Explanation**: {case.get("gt_explanation", "N/A")}"""

    baseline_col = f"""### {baseline} (基准)
- **Answer**: `{case["baseline_answer"]}` {"✓" if case["baseline_answer_correct"] else "✗"} | **Exp评估**: {"✓" if case["baseline_explanation_correct"] else "✗"}
- **Explanation**: {case["baseline_explanation"]}
- **Edit指令**: {case["baseline_edit_prompt"]}
- **I_Score**: **{case["baseline_i_score"]:.3f}** | Edited Acc: **{case["baseline_edited_acc"]:.3f}**"""

    comp_col = f"""### {comparison} (对比)
- **Answer**: `{case["comparison_answer"]}` {"✓" if case["comparison_answer_correct"] else "✗"} | **Exp评估**: {"✓" if case["comparison_explanation_correct"] else "✗"}
- **Explanation**: {case["comparison_explanation"] if case["comparison_explanation"] else "—"}
- **Edit指令**: {case["comparison_edit_prompt"] if case["comparison_edit_prompt"] else "—"}
- **I_Score**: **{case["comparison_i_score"]:.3f}** | Edited Acc: **{case["comparison_edited_acc"]:.3f}**"""

    # One-line head-to-head summary; "✓" marks a dimension where the baseline
    # is correct while the comparison model is not.
    i_diff = case["baseline_i_score"] - case["comparison_i_score"]
    ans_adv = "✓" if case["baseline_answer_correct"] and not case["comparison_answer_correct"] else "-"
    exp_adv = "✓" if case["baseline_explanation_correct"] and not case["comparison_explanation_correct"] else "-"
    summary = f"""**对比总结**: Answer({ans_adv}) | Explanation({exp_adv}) | I_Score: {baseline} **{case["baseline_i_score"]:.3f}** vs {comparison} {case["comparison_i_score"]:.3f} = **+{i_diff:.3f}** | Edited Acc: **{case["baseline_edited_acc"]:.3f}** vs {case["comparison_edited_acc"]:.3f}"""

    img_labels = f"原图 → {baseline} 编辑 ({baseline_editor}) → {comparison} 编辑 ({comparison_editor})"

    return (info_col, gt_col, baseline_col, comp_col, summary, img_labels, bad_img, baseline_img, comparison_img)
240
+
241
+
242
def get_statistics(eval_model_key: str) -> str:
    """Render a markdown summary: per baseline, a table of case counts by type."""
    cases_data = get_cases(eval_model_key)
    if not cases_data:
        return "数据未加载"

    parts = [f"### 统计摘要 ({eval_model_key.upper()})\n"]

    for base in BASELINE_MODELS:
        if base not in cases_data:
            continue
        parts.append(f"\n**{base}**:\n")
        parts.append("| 对比模型 | Type1 | Type2 | Type3 | 总计 |")
        parts.append("|----------|-------|-------|-------|------|")

        base_data = cases_data.get(base, {})
        for comp in COMPARISON_MODELS:
            comp_data = base_data.get(comp, {})
            counts = [
                len(comp_data.get(key, []))
                for key in ("type1_answer_wrong", "type2_explanation_wrong", "type3_edit_better")
            ]
            total = sum(counts)
            if total > 0:
                t1, t2, t3 = counts
                parts.append(f"| {comp} | {t1} | {t2} | {t3} | {total} |")

    return "\n".join(parts)
266
+
267
+
268
+ # ============================================================================
269
+ # Gradio 界面
270
+ # ============================================================================
271
+
272
def create_app():
    """Build and return the Gradio Blocks UI.

    Layout: collapsible stats accordion, selector row (eval model / baseline /
    comparison), case-type + case selectors, four markdown panels, a summary
    line, and a three-image row (original / baseline-edited / comparison-edited).
    """
    with gr.Blocks(title="ReflectionBench Case Viewer", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# ReflectionBench Case 展示")
        gr.Markdown("对比 Reflector (基准模型) 与其他 baseline 模型的表现差异")

        with gr.Accordion("统计摘要", open=False):
            stats_md = gr.Markdown()

        with gr.Row():
            eval_model = gr.Radio(choices=list(EVAL_MODELS.keys()), label="评估模型", value="qwen", scale=1)
            baseline = gr.Dropdown(choices=BASELINE_MODELS, label="基准模型 (Reflector)", value=BASELINE_MODELS[0], scale=1)
            comparison = gr.Dropdown(choices=COMPARISON_MODELS, label="对比模型", value=COMPARISON_MODELS[0], scale=1)

        with gr.Row():
            # (display label, value) pairs — the value is the JSON case-type key.
            case_type = gr.Radio(
                choices=[
                    ("Type1: Answer错误", "type1_answer_wrong"),
                    ("Type2: Exp错误→编辑失败", "type2_explanation_wrong"),
                    ("Type3: Edit更优", "type3_edit_better"),
                ],
                label="Case类型", value="type1_answer_wrong", scale=2
            )
            case_dropdown = gr.Dropdown(choices=[], label="选择Case", scale=2)

        with gr.Row():
            info_md = gr.Markdown()
            gt_md = gr.Markdown()

        with gr.Row():
            baseline_md = gr.Markdown()
            comparison_md = gr.Markdown()

        summary_md = gr.Markdown()
        img_label = gr.Markdown()

        with gr.Row():
            bad_image = gr.Image(label="原图", scale=1, height=200)
            baseline_edited = gr.Image(label="基准模型编辑后", scale=1, height=200)
            comparison_edited = gr.Image(label="对比模型编辑后", scale=1, height=200)

        # Order must match the 9-tuple returned by show_case().
        outputs = [info_md, gt_md, baseline_md, comparison_md, summary_md, img_label, bad_image, baseline_edited, comparison_edited]

        def on_eval_model_change(eval_key, base, comp, ctype):
            # Switching eval model invalidates stats, comparison choices, and cases.
            stats = get_statistics(eval_key)
            comp_update = update_comparison_choices(eval_key, base)
            case_update = update_case_list(eval_key, base, comp, ctype)
            return stats, comp_update, case_update

        eval_model.change(fn=on_eval_model_change, inputs=[eval_model, baseline, comparison, case_type], outputs=[stats_md, comparison, case_dropdown])

        def on_baseline_change(eval_key, base, comp, ctype):
            # Switching baseline invalidates comparison choices and the case list.
            comp_update = update_comparison_choices(eval_key, base)
            case_update = update_case_list(eval_key, base, comp, ctype)
            return comp_update, case_update

        baseline.change(fn=on_baseline_change, inputs=[eval_model, baseline, comparison, case_type], outputs=[comparison, case_dropdown])
        comparison.change(fn=update_case_list, inputs=[eval_model, baseline, comparison, case_type], outputs=[case_dropdown])
        case_type.change(fn=update_case_list, inputs=[eval_model, baseline, comparison, case_type], outputs=[case_dropdown])
        case_dropdown.change(fn=show_case, inputs=[eval_model, baseline, comparison, case_type, case_dropdown], outputs=outputs)

        def on_load(eval_key, base, comp, ctype):
            # Initial page load: populate stats, the case list, and — when a
            # first case exists — render it immediately.
            stats = get_statistics(eval_key)
            cases = get_case_list(eval_key, base, comp, ctype)
            case_val = cases[0] if cases else None
            if case_val:
                case_result = show_case(eval_key, base, comp, ctype, case_val)
            else:
                case_result = ("请选择 case", "", "", "", "", "", None, None, None)
            return (stats, gr.update(choices=cases, value=case_val)) + case_result

        demo.load(fn=on_load, inputs=[eval_model, baseline, comparison, case_type], outputs=[stats_md, case_dropdown] + outputs)

    return demo
345
+
346
+
347
if __name__ == "__main__":
    # Build the UI and start the Gradio server with default settings.
    demo = create_app()
    demo.launch()
reflectionbench_demo/detailed_cases_qwen.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edb7881740cc0a8b6784aec2b9e1bb7e3d814c6df48edeef23da9276078bb7ba
3
+ size 31125455
reflectionbench_demo/detailed_cases_qwen235b.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ab8a89ba7b98768f607a9ca9ebe3d369284dd6f5cbe5c0e6dfabc752f8487c
3
+ size 34145044
reflectionbench_demo/edited_images.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b48c25bc80055debf7b7310a7071cba2f2b18ac7fbc63115462b3823bbe07454
3
+ size 8475442818
reflectionbench_demo/images.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21502d845a90717f1d3f19cba06aaadd4272e51acb1dfd1d0120b3c62a8fd44f
3
+ size 681831366
reflectionbench_demo/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio>=4.0.0
2
+ Pillow>=9.0.0
3
+ numpy>=1.20.0