Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -159,109 +159,16 @@ def init_space_storage() -> None:
|
|
| 159 |
|
| 160 |
init_space_storage()
|
| 161 |
|
| 162 |
-
# Movie-Level 指标定义
|
| 163 |
MOVIE_CRITERIA: List[Tuple[str, str, str]] = [
|
| 164 |
-
("
|
| 165 |
-
("
|
| 166 |
-
("
|
| 167 |
-
("
|
| 168 |
-
("
|
| 169 |
-
("
|
| 170 |
-
("CT", "电影技巧", "镜头运动、景深控制及构图的专业性。"),
|
| 171 |
-
("AVR", "视听丰富度", "画面细节精细度以及音频层次(音效、氛围音)的丰富程度。"),
|
| 172 |
-
("NP", "叙事节奏", "镜头剪辑长短切换是否契合故事情节张力需求。"),
|
| 173 |
-
("VAC", "视听协调性", "画面动作与音效、音乐卡点的同步率。"),
|
| 174 |
-
("CD", "引人入胜程度", "吸引注意力并引发情感共鸣或沉浸感的能力。"),
|
| 175 |
-
("OQ", "整体质量", "对生成视频作为“电影作品”的综合观感评分。"),
|
| 176 |
]
|
| 177 |
|
| 178 |
-
METRIC_SCORING_STANDARDS: Dict[str, str] = {
|
| 179 |
-
"SF": (
|
| 180 |
-
"- **1分:严重偏离原始剧本**:>= 50% 的关键场景缺失或被替换,两个及以上主要角色属性被改动,且有三个及以上情节与原作矛盾。\n"
|
| 181 |
-
"- **2分:部分遵循原始剧本**:保留的关键场景少于 50%,角色设定有 1-2 处重大不一致,且至少两处偏离核心剧情事件。\n"
|
| 182 |
-
"- **3分:总体遵循原始剧本**:>= 70% 的关键场景被保留,角色设定基本一致,仅有不影响主线的次要偏差。\n"
|
| 183 |
-
"- **4分:高度忠实原始剧本**:>= 90% 的关键场景被准确呈现,主要角色设定均被保留,仅有轻微删减且不构成剧情冲突。\n"
|
| 184 |
-
"- **5分:完全忠实原始剧本**:所有关键场景、角色设定与相关细节均正确复现,无可检测偏差。"
|
| 185 |
-
),
|
| 186 |
-
"NC": (
|
| 187 |
-
"- **1分:叙事混乱无序**:存在 >= 3 个重大逻辑问题(如因果错误、时间矛盾、角色行为前后冲突、剧情断裂),导致无法连贯理解。\n"
|
| 188 |
-
"- **2分:基本可懂但缺陷明显**:有 >= 2 处清晰逻辑断点或过渡缺失,明显破坏叙事逻辑。\n"
|
| 189 |
-
"- **3分:总体连贯**:主线清晰,可能有 1 处轻微逻辑不足(如动机铺垫偏弱),但不影响整体理解。\n"
|
| 190 |
-
"- **4分:流畅连贯**:情节推进自然,因果关系清楚,仅有可忽略的逻辑瑕疵。\n"
|
| 191 |
-
"- **5分:完全连贯**:剧情发展自然且论证充分,无逻辑漏洞,所有因果关系清晰明确。"
|
| 192 |
-
),
|
| 193 |
-
"VQ": (
|
| 194 |
-
"- **1分:画面严重损坏**:出现多个关键失败(>= 3),如目标缺失、严重畸变、破帧,关键元素难以识别。\n"
|
| 195 |
-
"- **2分:明显视觉缺陷**:至少两处场景存在元素缺失或畸变,伪影明显干扰观看。\n"
|
| 196 |
-
"- **3分:画面基本完整**:核心元素齐全,偶发轻微错误或短暂伪影,不影响理解。\n"
|
| 197 |
-
"- **4分:画面清晰完整**:仅有极少轻微瑕疵,无明显缺失或严重畸变。\n"
|
| 198 |
-
"- **5分:画面无可挑剔**:所有元素始终正确呈现,无可见畸变或伪影。"
|
| 199 |
-
),
|
| 200 |
-
"CC": (
|
| 201 |
-
"- **1分:角色设计严重不一致**:跨场景有 >= 2 项主要外观属性(如脸型、发型、服饰)变化,同一角色可能像不同人。\n"
|
| 202 |
-
"- **2分:角色波动明显**:多个场景中角色特征变化明显,虽可辨认身份但一致性较差。\n"
|
| 203 |
-
"- **3分:角色总体一致**:外观基本稳定,仅在仔细观察时可见少量轻微不一致。\n"
|
| 204 |
-
"- **4分:角色高度一致**:几乎所有场景与角度下特征稳定,个别差异可忽略。\n"
|
| 205 |
-
"- **5分:角色完全一致**:所有场景与动作下角色特征精准保持,无可见波动。"
|
| 206 |
-
),
|
| 207 |
-
"PLC": (
|
| 208 |
-
"- **1分:严重违反物理规律**:存在 >= 3 处极端违背(不可能运动、重力错误、碰撞失真),真实感崩坏。\n"
|
| 209 |
-
"- **2分:多处违反物理规律**:至少两处明显物理错误,动作或效果显著不真实。\n"
|
| 210 |
-
"- **3分:总体符合物理规��**:大多数运动符合预期,部分动作稍显生硬但可接受。\n"
|
| 211 |
-
"- **4分:物理符合度较好**:运动自然、交互可信,仅有极少偏差。\n"
|
| 212 |
-
"- **5分:物理完全符合**:运动、碰撞与效果均符合现实规律,无异常。"
|
| 213 |
-
),
|
| 214 |
-
"V_AQ": (
|
| 215 |
-
"- **1分:音频极差**:人声不清或缺失,音效混乱或严重失真,影响内容理解。\n"
|
| 216 |
-
"- **2分:音频较差**:人声偶尔不清晰,音效较少或同步较差,明显低于可用标准。\n"
|
| 217 |
-
"- **3分:音频中等**:人声总体清楚,音效匹配基本合适,但精细度一般。\n"
|
| 218 |
-
"- **4分:音频良好**:人声清晰、混音良好,音效丰富且有效支撑场景。\n"
|
| 219 |
-
"- **5分:音频优秀**:人声清晰且富有表现力,声音设计细腻、同步精准,无明显缺陷。"
|
| 220 |
-
),
|
| 221 |
-
"CT": (
|
| 222 |
-
"- **1分:镜头单一僵硬**:构图与景别重复、静止,几乎无目的性电影语言。\n"
|
| 223 |
-
"- **2分:镜头变化有限**:有少量镜头类型但运镜生硬,电影语言使用不稳定或效果弱。\n"
|
| 224 |
-
"- **3分:常见技巧使用尚可**:近景/中景/远景等基本镜头具备,运镜总体平稳但风格不突出。\n"
|
| 225 |
-
"- **4分:电影语言丰富**:镜头类型多样且有意图,运镜自然并能增强叙事或情绪。\n"
|
| 226 |
-
"- **5分:技巧高度创造且精准**:镜头设计丰富有创意,运镜控制精准,电影语言表达力强且目的明确。"
|
| 227 |
-
),
|
| 228 |
-
"AVR": (
|
| 229 |
-
"- **1分:视听表达极其有限**:视觉与声音元素单调重复,变化和层次极少。\n"
|
| 230 |
-
"- **2分:表达基础且程式化**:虽有表达尝试,但形式简单可预测,风格多样性不足。\n"
|
| 231 |
-
"- **3分:多样性中等**:部分场景在风格或节奏上有变化,但整体丰富度不均衡、统一性不足。\n"
|
| 232 |
-
"- **4分:视听表达较强**:多种技法协同,形成层次、情绪转折或风格细节。\n"
|
| 233 |
-
"- **5分:视听语言极其丰富**:声音与画面运用多样且富创造力,形成鲜明艺术风格并带来强叙事/情感冲击。"
|
| 234 |
-
),
|
| 235 |
-
"NP": (
|
| 236 |
-
"- **1分:节奏完全失控**:出现 >= 3 处极端问题(突兀跳切、过长停滞、关键事件过快),严重影响理解。\n"
|
| 237 |
-
"- **2分:节奏明显不稳**:至少两处明显节奏失衡(过赶或拖沓),破坏整体韵律。\n"
|
| 238 |
-
"- **3分:节奏总体合适**:推进基本合理,个别场景略快/略慢但不影响理解。\n"
|
| 239 |
-
"- **4分:节奏控制良好**:时长与转场自然,张弛平衡较佳。\n"
|
| 240 |
-
"- **5分:节奏控制精准**:时间分配有明确意图,显著增强情绪张力与叙事清晰度,快慢切换顺畅。"
|
| 241 |
-
),
|
| 242 |
-
"VAC": (
|
| 243 |
-
"- **1分:视听严重不同步**:持续音画错位,多次口型偏差(多帧)与声画动作不匹配,显著影响观看。\n"
|
| 244 |
-
"- **2分:同步问题明显**:反复出现口型或时间点错位,语音与画面配合较差。\n"
|
| 245 |
-
"- **3分:基本同步**:大多数片段音画对齐,偶有轻微错位但不妨碍观看。\n"
|
| 246 |
-
"- **4分:协调性良好**:语音、音效与画面整体匹配,错误较少且影响很小。\n"
|
| 247 |
-
"- **5分:完美同步**:所有声音元素与画面动作、口型精准对应,整体体验和谐。"
|
| 248 |
-
),
|
| 249 |
-
"CD": (
|
| 250 |
-
"- **1分:毫无吸引力**:难以让观众沉浸或产生情感连接,内容缺乏参与感。\n"
|
| 251 |
-
"- **2分:吸引力不足**:情绪表达较弱,难以持续抓住观众注意力。\n"
|
| 252 |
-
"- **3分:有基础吸引力**:能引发一定兴趣,但情感深度不足,难形成强共鸣。\n"
|
| 253 |
-
"- **4分:吸引力较强**:情绪表达有效,能产生明确情绪反应并维持观看兴趣。\n"
|
| 254 |
-
"- **5分:极具感染力**:情绪张力与参与度很强,观众高度沉浸并产生强烈共鸣。"
|
| 255 |
-
),
|
| 256 |
-
"OQ": (
|
| 257 |
-
"- **1分:整体质量极差**:>= 3 个核心维度严重不足,明显影响理解与观看价值。\n"
|
| 258 |
-
"- **2分:整体质量较差**:至少两个主要维度低于可接受标准,观看价值有限。\n"
|
| 259 |
-
"- **3分:整体质量中等**:多数维度达到一般或可接受水平,优缺点相对平衡,具备基础观看价值。\n"
|
| 260 |
-
"- **4分:整体质量良好**:大部分维度表现到位且协同较好,仅有少量问题,观看价值较高。\n"
|
| 261 |
-
"- **5分:整体质量优秀**:主要维度均高水平发挥,表现稳定、协调且具艺术性,观看与审美价值很高。"
|
| 262 |
-
),
|
| 263 |
-
}
|
| 264 |
-
|
| 265 |
BASE_METRIC_KEYS = [k for k, _, _ in MOVIE_CRITERIA]
|
| 266 |
SAVE_LOCK = threading.Lock()
|
| 267 |
|
|
@@ -470,25 +377,48 @@ def sync_results_from_hub_to_local() -> None:
|
|
| 470 |
|
| 471 |
|
| 472 |
def build_pending_samples() -> List[Dict[str, Any]]:
|
| 473 |
-
"""构建样本池
|
| 474 |
all_samples = load_dataset_index()
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 491 |
|
|
|
|
| 492 |
for i, sample in enumerate(pending, start=1):
|
| 493 |
sample["anon_id"] = f"id_{i:03d}"
|
| 494 |
return pending
|
|
@@ -508,37 +438,41 @@ def build_data_diagnostics(samples: List[Dict[str, Any]]) -> str:
|
|
| 508 |
|
| 509 |
|
| 510 |
def compute_derived(scores: Dict[str, float]) -> Dict[str, float]:
|
| 511 |
-
"""计算 CL /
|
| 512 |
-
cl = (
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
+
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 527 |
-
result_dir = OUTPUT_DIR / "raw_results" / sample["
|
| 528 |
result_dir.mkdir(parents=True, exist_ok=True)
|
| 529 |
-
out_path = result_dir / f"{sample['
|
| 530 |
-
|
| 531 |
-
score_float = {k: float(v) for k, v in scores.items()}
|
| 532 |
-
derived = compute_derived(score_float)
|
| 533 |
|
| 534 |
payload = {
|
| 535 |
"timestamp": datetime.now().isoformat(),
|
| 536 |
"evaluator_id": evaluator_id,
|
| 537 |
-
"
|
| 538 |
-
"
|
| 539 |
-
"
|
|
|
|
| 540 |
"summary": summary,
|
| 541 |
-
"derived": derived,
|
| 542 |
}
|
| 543 |
with open(out_path, "w", encoding="utf-8") as f:
|
| 544 |
json.dump(payload, f, ensure_ascii=False, indent=2)
|
|
@@ -548,7 +482,7 @@ def save_single_result(sample: Dict[str, Any], evaluator_id: str, scores: Dict[s
|
|
| 548 |
def recompute_method_aggregates() -> Path:
|
| 549 |
"""
|
| 550 |
统计每个方法各维度均分,并输出 method_aggregates.json。
|
| 551 |
-
同时给出 CL/
|
| 552 |
"""
|
| 553 |
raw_root = OUTPUT_DIR / "raw_results"
|
| 554 |
method_scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
|
|
@@ -558,23 +492,21 @@ def recompute_method_aggregates() -> Path:
|
|
| 558 |
for fp in raw_root.rglob("*.json"):
|
| 559 |
with open(fp, "r", encoding="utf-8-sig") as f:
|
| 560 |
data = json.load(f)
|
| 561 |
-
|
| 562 |
-
scores
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
for d_key, d_val in derived.items():
|
| 572 |
-
method_scores[method][d_key].append(float(d_val))
|
| 573 |
|
| 574 |
agg = {
|
| 575 |
"updated_at": datetime.now().isoformat(),
|
| 576 |
"metric_keys": BASE_METRIC_KEYS,
|
| 577 |
-
|
| 578 |
"methods": {},
|
| 579 |
}
|
| 580 |
for method in sorted(method_scores.keys()):
|
|
@@ -629,8 +561,11 @@ def push_result_files_to_hub(single_path: Path, agg_path: Path) -> Optional[str]
|
|
| 629 |
def build_sample_brief_html(sample: Dict[str, Any], index: int, total: int) -> str:
|
| 630 |
story = sample.get("story_text") or "(未找到对应 story 文本,请检查 clip_movie_story 下是否有同名 txt)"
|
| 631 |
safe_story = html.escape(story)
|
|
|
|
|
|
|
| 632 |
return (
|
| 633 |
"<div class='sample-card'>"
|
|
|
|
| 634 |
"<div class='story-title'>剧情描述</div>"
|
| 635 |
f"<p class='story-body'>{safe_story}</p>"
|
| 636 |
"</div>"
|
|
@@ -649,7 +584,7 @@ def create_app():
|
|
| 649 |
"""
|
| 650 |
<div id="hero">
|
| 651 |
<h1>VideoEval · Movie-Level Evaluation</h1>
|
| 652 |
-
<p>统一电影级评测问卷,支持方法级均分统计(含 CL /
|
| 653 |
</div>
|
| 654 |
"""
|
| 655 |
)
|
|
@@ -660,24 +595,26 @@ def create_app():
|
|
| 660 |
with gr.Row():
|
| 661 |
with gr.Column(elem_classes=["panel", "center-panel"]):
|
| 662 |
gr.HTML("<div class='section-head' style='text-align:center;'>1) 视频与剧情</div>")
|
| 663 |
-
|
|
|
|
|
|
|
| 664 |
sample_info = gr.HTML(
|
| 665 |
"<div class='sample-card'><p class='story-body'>无可用样本</p></div>"
|
| 666 |
if not samples else build_sample_brief_html(samples[0], 0, len(samples))
|
| 667 |
)
|
| 668 |
|
| 669 |
status = gr.Markdown("")
|
| 670 |
-
gr.Markdown("## 2) 评分(
|
| 671 |
-
gr.Markdown("<span class='hint'>
|
| 672 |
|
| 673 |
score_widgets: Dict[str, gr.Radio] = {}
|
| 674 |
metric_groups = {
|
| 675 |
-
"I. 叙事与剧本 (NS)": ["
|
| 676 |
-
"II. 视听与技术 (AT)": ["
|
| 677 |
-
"III. 美学与表现力 (AE)": ["
|
| 678 |
-
"IV. 节奏与流动性 (RF)": ["
|
| 679 |
-
"V. 情感与参与度 (EE)": ["
|
| 680 |
-
"VI. 整体体验 (OE)": ["
|
| 681 |
}
|
| 682 |
criteria_map = {k: (name, desc) for k, name, desc in MOVIE_CRITERIA}
|
| 683 |
|
|
@@ -688,11 +625,9 @@ def create_app():
|
|
| 688 |
with gr.Group(elem_classes=["metric-card"]):
|
| 689 |
gr.Markdown(f"**{key} · {name}**")
|
| 690 |
gr.Markdown(f"<span class='hint'>{desc}</span>")
|
| 691 |
-
|
| 692 |
-
gr.Markdown(METRIC_SCORING_STANDARDS[key])
|
| 693 |
-
score_widgets[key] = gr.Radio(choices=[1, 2, 3, 4, 5], label=f"{key} Score")
|
| 694 |
|
| 695 |
-
final_summary = gr.Textbox(label="Final Summary(可选)", lines=4, placeholder="总结
|
| 696 |
submit_btn = gr.Button("提交", variant="primary", elem_id="submit-btn")
|
| 697 |
|
| 698 |
def _submit(summary: str, curr_samples: List[Dict[str, Any]], *score_vals):
|
|
@@ -704,40 +639,46 @@ def create_app():
|
|
| 704 |
sample = curr_samples[0]
|
| 705 |
evaluator_id = "anonymous"
|
| 706 |
|
| 707 |
-
|
| 708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
for i, key in enumerate(BASE_METRIC_KEYS):
|
| 710 |
raw_score = score_vals[i] if i < len(score_vals) else None
|
| 711 |
|
| 712 |
-
|
| 713 |
-
if raw_score in (None, "", [], 0):
|
| 714 |
msg = f"❌ 请为 `{key}` 打分。"
|
| 715 |
gr.Warning(msg)
|
| 716 |
return "", False
|
| 717 |
-
if isinstance(raw_score, str) and raw_score.strip().lower() in {"none", "null", "[]"
|
| 718 |
msg = f"❌ 请为 `{key}` 打分。"
|
| 719 |
gr.Warning(msg)
|
| 720 |
return "", False
|
| 721 |
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
msg = f"❌ `{key}` 的评分无效,请重新选择 1-5 分。"
|
| 726 |
gr.Warning(msg)
|
| 727 |
return msg, False
|
| 728 |
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
|
|
|
|
|
|
|
|
|
| 736 |
|
| 737 |
with SAVE_LOCK:
|
| 738 |
# 同步远程最新结果,确保“允许重复提交”后平均分统计包含全量提交。
|
| 739 |
sync_results_from_hub_to_local()
|
| 740 |
-
single_path = save_single_result(sample, evaluator_id,
|
| 741 |
agg_path = recompute_method_aggregates()
|
| 742 |
push_err = push_result_files_to_hub(single_path, agg_path)
|
| 743 |
|
|
@@ -749,34 +690,47 @@ def create_app():
|
|
| 749 |
_ = (single_path, agg_path)
|
| 750 |
return "", True
|
| 751 |
|
| 752 |
-
def _refresh_on_load() -> Tuple[Any, str, str, List[Dict[str, Any]]]:
|
| 753 |
refreshed_samples = build_pending_samples()
|
| 754 |
if not refreshed_samples:
|
| 755 |
-
return None, "<div class='sample-card'><p class='story-body'>无可用样本(
|
| 756 |
|
| 757 |
first = refreshed_samples[0]
|
| 758 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 759 |
|
| 760 |
def _refresh_after_submit(
|
| 761 |
submit_ok: bool,
|
| 762 |
submit_msg: str,
|
| 763 |
-
|
|
|
|
| 764 |
curr_info: str,
|
| 765 |
curr_samples: List[Dict[str, Any]],
|
| 766 |
-
) -> Tuple[Any, str, str, List[Dict[str, Any]]]:
|
| 767 |
submit_msg = (submit_msg or "").strip()
|
| 768 |
# 提交失败时,不刷新样本/故事,保持当前页面不变
|
| 769 |
if not submit_ok:
|
| 770 |
-
return
|
| 771 |
|
| 772 |
refreshed_samples = build_pending_samples()
|
| 773 |
if not refreshed_samples:
|
| 774 |
status_msg = submit_msg
|
| 775 |
-
return None, "<div class='sample-card'><p class='story-body'>无可用样本(
|
| 776 |
|
| 777 |
first = refreshed_samples[0]
|
| 778 |
status_msg = submit_msg
|
| 779 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 780 |
|
| 781 |
def _clear_scores_after_submit(submit_ok: bool) -> Tuple[Any, ...]:
|
| 782 |
# 提交失败时不清空输入,便于用户补充后重提
|
|
@@ -801,13 +755,13 @@ def create_app():
|
|
| 801 |
)
|
| 802 |
submit_evt.then(
|
| 803 |
_refresh_after_submit,
|
| 804 |
-
inputs=[submit_ok_state, status,
|
| 805 |
-
outputs=[
|
| 806 |
)
|
| 807 |
|
| 808 |
app.load(
|
| 809 |
_refresh_on_load,
|
| 810 |
-
outputs=[
|
| 811 |
)
|
| 812 |
|
| 813 |
return app
|
|
|
|
| 159 |
|
| 160 |
init_space_storage()
|
| 161 |
|
| 162 |
+
# Movie-level metric definitions (only the six aggregate metrics are kept).
# Each entry is a (key, display name, short description) triple; the key is
# what the scoring code and the saved results are indexed by.
MOVIE_CRITERIA: List[Tuple[str, str, str]] = [
    ("NS", "叙事与剧本", "剧情忠实且连贯"),
    ("AT", "视听与技术", "画音质量与一致性"),
    ("AE", "美学与表现力", "镜头语言与风格层次"),
    ("RF", "节奏与流动性", "叙事节奏与音画衔接"),
    ("EE", "情感与参与度", "情绪感染与沉浸感"),
    ("OE", "整体体验", "整体观感与完成度"),
]
|
| 171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
BASE_METRIC_KEYS = [k for k, _, _ in MOVIE_CRITERIA]
|
| 173 |
SAVE_LOCK = threading.Lock()
|
| 174 |
|
|
|
|
| 377 |
|
| 378 |
|
| 379 |
def build_pending_samples() -> List[Dict[str, Any]]:
    """Build the shuffled pool of A/B comparison pairs.

    Samples returned by ``load_dataset_index()`` are grouped by their
    ``story_name``. Within each story the samples are sorted by ``method``
    and every pair of samples with *different* methods becomes one
    comparison. Which sample is shown as "A" and which as "B" is decided by
    a coin flip to reduce left/right position bias.

    Returns:
        A list of pair dicts with the keys ``pair_id``, ``story_name``,
        ``story_text``, ``A``, ``B`` and ``anon_id``. ``A`` and ``B`` each
        hold ``method``, ``video_name``, ``video_path`` and ``sample_id``.
        The list is shuffled and ``anon_id`` (``id_001``, ``id_002``, ...)
        reflects the position after shuffling.

    Raises:
        KeyError: If a sample lacks ``story_name``, ``method``,
            ``video_name``, ``video_path`` or ``sample_id``.
    """
    # Function-scope import: the file's top-level import block is not part of
    # this change, so the dependency is kept local to the function.
    from itertools import combinations

    def _side(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Only the fields needed to display and attribute one side of a pair.
        return {
            "method": entry["method"],
            "video_name": entry["video_name"],
            "video_path": entry["video_path"],
            "sample_id": entry["sample_id"],
        }

    by_story: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
    for sample in load_dataset_index():
        by_story[sample["story_name"]].append(sample)

    pending: List[Dict[str, Any]] = []
    for story_name, story_samples in by_story.items():
        # An A/B comparison needs at least two entries for the same story.
        if len(story_samples) < 2:
            continue
        ordered = sorted(story_samples, key=lambda s: s["method"])
        # combinations() yields (i, j) with i < j in the same order as the
        # equivalent nested index loops, so pair_id naming stays stable.
        for first, second in combinations(ordered, 2):
            # Per-pair scores are keyed by method name downstream, so a pair
            # of two samples from the same method cannot be scored; skip it.
            if first["method"] == second["method"]:
                continue
            # Randomise left/right placement to reduce position bias.
            if random.random() < 0.5:
                a_sample, b_sample = first, second
            else:
                a_sample, b_sample = second, first
            pending.append(
                {
                    # pair_id uses the sorted order, not the displayed A/B
                    # order, so it is independent of the coin flip.
                    "pair_id": f"{story_name}__{first['method']}_vs_{second['method']}",
                    "story_name": story_name,
                    "story_text": first.get("story_text", "") or second.get("story_text", ""),
                    "A": _side(a_sample),
                    "B": _side(b_sample),
                }
            )

    random.shuffle(pending)
    for i, sample in enumerate(pending, start=1):
        sample["anon_id"] = f"id_{i:03d}"
    return pending
|
|
|
|
| 438 |
|
| 439 |
|
| 440 |
def compute_derived(scores: Dict[str, float]) -> Dict[str, float]:
    """Compute the derived CL / CRH / AVG values from the six base metrics.

    Args:
        scores: Mapping that must contain the keys "NS", "AT", "AE", "RF",
            "EE" and "OE".

    Returns:
        A dict with exactly the keys "CL", "CRH" and "AVG".

    Raises:
        KeyError: If any of the six base metric keys is missing.
    """
    ns = scores["NS"]
    at = scores["AT"]
    ae = scores["AE"]
    rf = scores["RF"]
    ee = scores["EE"]
    oe = scores["OE"]

    # CL and CRH are each a weighted mean (weights sum to the divisor 5) plus
    # an additive 0.5 * AE term, so they are not on the same scale as AVG.
    cl = ((2 * ns + 3 * at) / 5.0) + 0.5 * ae
    crh = ((at + 2 * rf + ee + oe) / 5.0) + 0.5 * ae
    # AVG is a plain weighted mean: the weights 2+4+2+2+1+1 sum to 12.
    avg = (2 * ns + 4 * at + 2 * ae + 2 * rf + ee + oe) / 12.0
    return {"CL": cl, "CRH": crh, "AVG": avg}
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def save_single_result(
|
| 456 |
+
sample: Dict[str, Any],
|
| 457 |
+
evaluator_id: str,
|
| 458 |
+
metric_choice: Dict[str, str],
|
| 459 |
+
method_scores: Dict[str, Dict[str, float]],
|
| 460 |
+
summary: str,
|
| 461 |
+
) -> Path:
|
| 462 |
+
"""保存单个 A/B 对比问卷结果。"""
|
| 463 |
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 464 |
+
result_dir = OUTPUT_DIR / "raw_results" / sample["story_name"]
|
| 465 |
result_dir.mkdir(parents=True, exist_ok=True)
|
| 466 |
+
out_path = result_dir / f"{sample['pair_id']}_{evaluator_id}_{ts}.json"
|
|
|
|
|
|
|
|
|
|
| 467 |
|
| 468 |
payload = {
|
| 469 |
"timestamp": datetime.now().isoformat(),
|
| 470 |
"evaluator_id": evaluator_id,
|
| 471 |
+
"pair": sample,
|
| 472 |
+
"metric_choice": metric_choice,
|
| 473 |
+
"method_scores": method_scores,
|
| 474 |
+
"method_derived": {m: compute_derived(v) for m, v in method_scores.items()},
|
| 475 |
"summary": summary,
|
|
|
|
| 476 |
}
|
| 477 |
with open(out_path, "w", encoding="utf-8") as f:
|
| 478 |
json.dump(payload, f, ensure_ascii=False, indent=2)
|
|
|
|
| 482 |
def recompute_method_aggregates() -> Path:
|
| 483 |
"""
|
| 484 |
统计每个方法各维度均分,并输出 method_aggregates.json。
|
| 485 |
+
同时给出 CL/CRH/AVG 的方法均值。
|
| 486 |
"""
|
| 487 |
raw_root = OUTPUT_DIR / "raw_results"
|
| 488 |
method_scores: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list))
|
|
|
|
| 492 |
for fp in raw_root.rglob("*.json"):
|
| 493 |
with open(fp, "r", encoding="utf-8-sig") as f:
|
| 494 |
data = json.load(f)
|
| 495 |
+
pair_method_scores = data.get("method_scores", {})
|
| 496 |
+
for method, scores in pair_method_scores.items():
|
| 497 |
+
if not all(k in scores for k in BASE_METRIC_KEYS):
|
| 498 |
+
continue
|
| 499 |
+
method_count[method] += 1
|
| 500 |
+
for k in BASE_METRIC_KEYS:
|
| 501 |
+
method_scores[method][k].append(float(scores[k]))
|
| 502 |
+
derived = compute_derived({k: float(scores[k]) for k in BASE_METRIC_KEYS})
|
| 503 |
+
for d_key, d_val in derived.items():
|
| 504 |
+
method_scores[method][d_key].append(float(d_val))
|
|
|
|
|
|
|
| 505 |
|
| 506 |
agg = {
|
| 507 |
"updated_at": datetime.now().isoformat(),
|
| 508 |
"metric_keys": BASE_METRIC_KEYS,
|
| 509 |
+
"derived_keys": ["CL", "CRH", "AVG"],
|
| 510 |
"methods": {},
|
| 511 |
}
|
| 512 |
for method in sorted(method_scores.keys()):
|
|
|
|
| 561 |
def build_sample_brief_html(sample: Dict[str, Any], index: int, total: int) -> str:
|
| 562 |
story = sample.get("story_text") or "(未找到对应 story 文本,请检查 clip_movie_story 下是否有同名 txt)"
|
| 563 |
safe_story = html.escape(story)
|
| 564 |
+
a_method = html.escape(sample.get("A", {}).get("method", ""))
|
| 565 |
+
b_method = html.escape(sample.get("B", {}).get("method", ""))
|
| 566 |
return (
|
| 567 |
"<div class='sample-card'>"
|
| 568 |
+
f"<div class='sid'>对比 {index + 1}/{total} · A: {a_method} · B: {b_method}</div>"
|
| 569 |
"<div class='story-title'>剧情描述</div>"
|
| 570 |
f"<p class='story-body'>{safe_story}</p>"
|
| 571 |
"</div>"
|
|
|
|
| 584 |
"""
|
| 585 |
<div id="hero">
|
| 586 |
<h1>VideoEval · Movie-Level Evaluation</h1>
|
| 587 |
+
<p>统一电影级评测问卷,支持方法级均分统计(含 CL / CRH / AVG)</p>
|
| 588 |
</div>
|
| 589 |
"""
|
| 590 |
)
|
|
|
|
| 595 |
with gr.Row():
|
| 596 |
with gr.Column(elem_classes=["panel", "center-panel"]):
|
| 597 |
gr.HTML("<div class='section-head' style='text-align:center;'>1) 视频与剧情</div>")
|
| 598 |
+
with gr.Row():
|
| 599 |
+
video_a = gr.Video(label="A", value=samples[0]["A"]["video_path"] if samples else None, height=360)
|
| 600 |
+
video_b = gr.Video(label="B", value=samples[0]["B"]["video_path"] if samples else None, height=360)
|
| 601 |
sample_info = gr.HTML(
|
| 602 |
"<div class='sample-card'><p class='story-body'>无可用样本</p></div>"
|
| 603 |
if not samples else build_sample_brief_html(samples[0], 0, len(samples))
|
| 604 |
)
|
| 605 |
|
| 606 |
status = gr.Markdown("")
|
| 607 |
+
gr.Markdown("## 2) 对比评分(A好 / B好 / 平手)")
|
| 608 |
+
gr.Markdown("<span class='hint'>每项都必须选择,A好= A得1/B得0,B好反之,平手各0.5。</span>")
|
| 609 |
|
| 610 |
score_widgets: Dict[str, gr.Radio] = {}
|
| 611 |
metric_groups = {
|
| 612 |
+
"I. 叙事与剧本 (NS)": ["NS"],
|
| 613 |
+
"II. 视听与技术 (AT)": ["AT"],
|
| 614 |
+
"III. 美学与表现力 (AE)": ["AE"],
|
| 615 |
+
"IV. 节奏与流动性 (RF)": ["RF"],
|
| 616 |
+
"V. 情感与参与度 (EE)": ["EE"],
|
| 617 |
+
"VI. 整体体验 (OE)": ["OE"],
|
| 618 |
}
|
| 619 |
criteria_map = {k: (name, desc) for k, name, desc in MOVIE_CRITERIA}
|
| 620 |
|
|
|
|
| 625 |
with gr.Group(elem_classes=["metric-card"]):
|
| 626 |
gr.Markdown(f"**{key} · {name}**")
|
| 627 |
gr.Markdown(f"<span class='hint'>{desc}</span>")
|
| 628 |
+
score_widgets[key] = gr.Radio(choices=["A好", "B好", "平手"], label=f"{key} Winner")
|
|
|
|
|
|
|
| 629 |
|
| 630 |
+
final_summary = gr.Textbox(label="Final Summary(可选)", lines=4, placeholder="总结 A/B 的主要优缺点")
|
| 631 |
submit_btn = gr.Button("提交", variant="primary", elem_id="submit-btn")
|
| 632 |
|
| 633 |
def _submit(summary: str, curr_samples: List[Dict[str, Any]], *score_vals):
|
|
|
|
| 639 |
sample = curr_samples[0]
|
| 640 |
evaluator_id = "anonymous"
|
| 641 |
|
| 642 |
+
a_method = sample["A"]["method"]
|
| 643 |
+
b_method = sample["B"]["method"]
|
| 644 |
+
method_scores: Dict[str, Dict[str, float]] = {
|
| 645 |
+
a_method: {k: 0.0 for k in BASE_METRIC_KEYS},
|
| 646 |
+
b_method: {k: 0.0 for k in BASE_METRIC_KEYS},
|
| 647 |
+
}
|
| 648 |
+
metric_choice: Dict[str, str] = {}
|
| 649 |
for i, key in enumerate(BASE_METRIC_KEYS):
|
| 650 |
raw_score = score_vals[i] if i < len(score_vals) else None
|
| 651 |
|
| 652 |
+
if raw_score in (None, "", []):
|
|
|
|
| 653 |
msg = f"❌ 请为 `{key}` 打分。"
|
| 654 |
gr.Warning(msg)
|
| 655 |
return "", False
|
| 656 |
+
if isinstance(raw_score, str) and raw_score.strip().lower() in {"none", "null", "[]"}:
|
| 657 |
msg = f"❌ 请为 `{key}` 打分。"
|
| 658 |
gr.Warning(msg)
|
| 659 |
return "", False
|
| 660 |
|
| 661 |
+
choice = str(raw_score).strip()
|
| 662 |
+
if choice not in {"A好", "B好", "平手"}:
|
| 663 |
+
msg = f"❌ `{key}` 的选择无效,请重新选择 A好/B好/平手。"
|
|
|
|
| 664 |
gr.Warning(msg)
|
| 665 |
return msg, False
|
| 666 |
|
| 667 |
+
metric_choice[key] = choice
|
| 668 |
+
if choice == "A好":
|
| 669 |
+
method_scores[a_method][key] = 1.0
|
| 670 |
+
method_scores[b_method][key] = 0.0
|
| 671 |
+
elif choice == "B好":
|
| 672 |
+
method_scores[a_method][key] = 0.0
|
| 673 |
+
method_scores[b_method][key] = 1.0
|
| 674 |
+
else:
|
| 675 |
+
method_scores[a_method][key] = 0.5
|
| 676 |
+
method_scores[b_method][key] = 0.5
|
| 677 |
|
| 678 |
with SAVE_LOCK:
|
| 679 |
# 同步远程最新结果,确保“允许重复提交”后平均分统计包含全量提交。
|
| 680 |
sync_results_from_hub_to_local()
|
| 681 |
+
single_path = save_single_result(sample, evaluator_id, metric_choice, method_scores, summary or "")
|
| 682 |
agg_path = recompute_method_aggregates()
|
| 683 |
push_err = push_result_files_to_hub(single_path, agg_path)
|
| 684 |
|
|
|
|
| 690 |
_ = (single_path, agg_path)
|
| 691 |
return "", True
|
| 692 |
|
| 693 |
+
def _refresh_on_load() -> Tuple[Any, Any, str, str, List[Dict[str, Any]]]:
    """Rebuild the comparison pool and return the initial page contents.

    Returns:
        A 5-tuple of (video A path, video B path, sample info HTML, status
        text, pair list). The status text is always empty here. When no
        pair is available both video values are ``None`` and the HTML is a
        "no samples" placeholder card.
    """
    refreshed_samples = build_pending_samples()
    if not refreshed_samples:
        return (
            None,
            None,
            "<div class='sample-card'><p class='story-body'>无可用样本(需要同剧情下至少两个方法)</p></div>",
            "",
            refreshed_samples,
        )

    # Always present the first pair of the freshly shuffled pool.
    first = refreshed_samples[0]
    return (
        first["A"]["video_path"],
        first["B"]["video_path"],
        build_sample_brief_html(first, 0, len(refreshed_samples)),
        "",
        refreshed_samples,
    )
|
| 706 |
|
| 707 |
def _refresh_after_submit(
|
| 708 |
submit_ok: bool,
|
| 709 |
submit_msg: str,
|
| 710 |
+
curr_video_a: Any,
|
| 711 |
+
curr_video_b: Any,
|
| 712 |
curr_info: str,
|
| 713 |
curr_samples: List[Dict[str, Any]],
|
| 714 |
+
) -> Tuple[Any, Any, str, str, List[Dict[str, Any]]]:
|
| 715 |
submit_msg = (submit_msg or "").strip()
|
| 716 |
# 提交失败时,不刷新样本/故事,保持当前页面不变
|
| 717 |
if not submit_ok:
|
| 718 |
+
return curr_video_a, curr_video_b, curr_info, submit_msg, curr_samples
|
| 719 |
|
| 720 |
refreshed_samples = build_pending_samples()
|
| 721 |
if not refreshed_samples:
|
| 722 |
status_msg = submit_msg
|
| 723 |
+
return None, None, "<div class='sample-card'><p class='story-body'>无可用样本(需要同剧情下至少两个方法)</p></div>", status_msg, refreshed_samples
|
| 724 |
|
| 725 |
first = refreshed_samples[0]
|
| 726 |
status_msg = submit_msg
|
| 727 |
+
return (
|
| 728 |
+
first["A"]["video_path"],
|
| 729 |
+
first["B"]["video_path"],
|
| 730 |
+
build_sample_brief_html(first, 0, len(refreshed_samples)),
|
| 731 |
+
status_msg,
|
| 732 |
+
refreshed_samples,
|
| 733 |
+
)
|
| 734 |
|
| 735 |
def _clear_scores_after_submit(submit_ok: bool) -> Tuple[Any, ...]:
|
| 736 |
# 提交失败时不清空输入,便于用户补充后重提
|
|
|
|
| 755 |
)
|
| 756 |
submit_evt.then(
|
| 757 |
_refresh_after_submit,
|
| 758 |
+
inputs=[submit_ok_state, status, video_a, video_b, sample_info, samples_state],
|
| 759 |
+
outputs=[video_a, video_b, sample_info, status, samples_state],
|
| 760 |
)
|
| 761 |
|
| 762 |
app.load(
|
| 763 |
_refresh_on_load,
|
| 764 |
+
outputs=[video_a, video_b, sample_info, status, samples_state],
|
| 765 |
)
|
| 766 |
|
| 767 |
return app
|