File size: 1,390 Bytes
f81a1f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import json
import logging

import gradio as gr

from eval_qwen3_vl import run_evaluation
from config import OUTPUT_PATH

def start_evaluation():
    """Run the evaluation and format its outcome for the Gradio UI.

    Calls ``run_evaluation()`` and builds the two strings shown in the output
    textboxes: a summary (accuracy, question count, output path) and a JSON
    preview of the first 10 result entries, kept short so the page stays
    readable.

    Returns:
        A ``(summary, preview_json)`` tuple of strings. If anything raises,
        the summary carries the error message and the preview is an empty
        string, so the click handler never propagates an exception to the UI.
    """
    try:
        acc, results = run_evaluation()
        # Slicing already copes with fewer than 10 entries.
        sample_results = results[:10]
        # NOTE(review): the count is reported as len(results) - 1, presumably
        # because results carries one non-question entry -- confirm against
        # run_evaluation. Clamped so an empty result list shows 0, not -1.
        question_count = max(len(results) - 1, 0)
        return (
            f"✅ 评测完成!总准确率:{acc:.2%}\n"
            f"📊 共评测 {question_count} 题(VisuLogic 1000题)\n"
            f"📁 完整结果已保存到:{OUTPUT_PATH}",
            json.dumps(sample_results, ensure_ascii=False, indent=2)
        )
    except Exception as e:
        # UI boundary: report the failure in the textbox, but keep the full
        # traceback in the process logs so it can still be diagnosed.
        logging.getLogger(__name__).exception("Evaluation failed")
        return f"❌ 评测出错:{str(e)}", ""

# Build the Gradio interface: a title, a subtitle, and one row holding the
# run button plus the two result textboxes.
with gr.Blocks(title="Qwen3-VL VisuLogic 评测") as demo:
    gr.Markdown("# Qwen3-VL VisuLogic 评测工具")
    gr.Markdown("### 一键运行 VisuLogic 1000题 视觉逻辑推理评测")

    with gr.Row():
        # gr.Button documents its size values as "sm" / "md" / "lg";
        # "large" is not one of them, so use "lg".
        run_btn = gr.Button("🚀 开始评测", size="lg")
        acc_output = gr.Textbox(label="评测结果", lines=5)
        sample_output = gr.Textbox(label="前10题详情", lines=10)

    # Wire the button: start_evaluation takes no inputs and returns two
    # strings, which fill the summary box and the preview box in that order.
    run_btn.click(start_evaluation, outputs=[acc_output, sample_output])

# Launch Gradio when run as a script (original note: the Space adapts the
# port automatically). Host 0.0.0.0 and port 7860 are passed explicitly.
if __name__ == "__main__":
    launch_kwargs = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_kwargs)