陈文韬 committed on
Commit
f81a1f8
·
1 Parent(s): 7e9488a

first commit

Browse files
Files changed (5) hide show
  1. .gitignore +11 -0
  2. app.py +36 -0
  3. config.py +19 -0
  4. eval_qwen3_vl.py +102 -0
  5. requirements.txt +19 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 缓存/日志
2
+ cache/
3
+ *.log
4
+ __pycache__/
5
+ *.pyc
6
+
7
+ # 结果文件(可选:如果想保留结果,删掉这行)
8
+ results/
9
+
10
+ # 环境变量
11
+ .env
app.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ from eval_qwen3_vl import run_evaluation
4
+ from config import OUTPUT_PATH
5
+
6
def start_evaluation():
    """Run the evaluation and format its outcome for the Gradio UI.

    Returns:
        A ``(summary, sample_json)`` tuple of strings. ``summary`` reports the
        overall accuracy, the number of evaluated questions and the path of the
        saved results file; ``sample_json`` is a pretty-printed JSON dump of at
        most the first 10 per-question records. If anything raises, the tuple
        is an error message plus an empty string so the UI shows the failure.
    """
    try:
        acc, results = run_evaluation()

        # run_evaluation() appends a {"total_accuracy": ...} summary record to
        # the per-question list. Filter it out so that neither the question
        # count nor the preview can contain it (previously a run with <= 10
        # questions leaked the summary record into the "first 10" preview).
        per_question = [r for r in results if "total_accuracy" not in r]

        # Show only the first 10 questions to keep the UI short.
        sample_results = per_question[:10]
        return (
            f"✅ 评测完成!总准确率:{acc:.2%}\n"
            f"📊 共评测 {len(per_question)} 题(VisuLogic 1000题)\n"
            f"📁 完整结果已保存到:{OUTPUT_PATH}",
            json.dumps(sample_results, ensure_ascii=False, indent=2),
        )
    except Exception as e:  # UI boundary: report the error instead of crashing
        return f"❌ 评测出错:{str(e)}", ""
20
+
21
# Build the Gradio interface.
# NOTE(review): indentation was lost in this diff view. The nesting below
# (all three widgets inside the Row, click binding inside the Blocks context)
# is a reconstruction — confirm against the real app.py.
with gr.Blocks(title="Qwen3-VL VisuLogic 评测") as demo:
    gr.Markdown("# Qwen3-VL VisuLogic 评测工具")
    gr.Markdown("### 一键运行 VisuLogic 1000题 视觉逻辑推理评测")

    with gr.Row():
        run_btn = gr.Button("🚀 开始评测", size="large")
        acc_output = gr.Textbox(label="评测结果", lines=5)
        sample_output = gr.Textbox(label="前10题详情", lines=10)

    # Bind the button: the two values returned by start_evaluation fill the
    # summary textbox and the sample-details textbox, in that order.
    run_btn.click(start_evaluation, outputs=[acc_output, sample_output])

# Launch Gradio when run as a script, listening on all interfaces at port 7860
# (the original note says the Space adapts the port automatically).
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
config.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 配置文件:修改这里的参数即可,不用改核心代码
2
+ import os
3
+ from dotenv import load_dotenv
4
+
5
# Load environment variables (per the original note, DASHSCOPE_API_KEY is
# configured in the Space's Secrets).
load_dotenv()

# 1. API configuration (Alibaba Cloud DashScope; the hosted API is preferred
#    over running a large model locally).
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY")  # None when the variable is unset
QWEN_VL_MODEL = "qwen-vl-plus"  # alternatives: qwen-vl-max / qwen-vl-turbo

# 2. Dataset configuration
VISULOGIC_DATASET = "Sellopale/VisuLogic"  # Hugging Face dataset id
DATASET_CACHE_DIR = "./cache/VisuLogic"

# 3. Evaluation configuration
TEMPERATURE = 0.0  # fixed temperature, intended to keep results reproducible
MAX_TOKENS = 10  # the model is asked for a single letter (A/B/C/D), so few tokens are needed
OUTPUT_PATH = "./results/qwen3-vl-visulogic.json"
eval_qwen3_vl.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import dashscope
4
+ from dashscope import MultiModalConversation
5
+ from datasets import load_dataset
6
+ from config import *
7
+
8
# Initialize DashScope: set the module-level API key from the config value
# (DASHSCOPE_API_KEY arrives via `from config import *`).
dashscope.api_key = DASHSCOPE_API_KEY
10
+
11
def load_visulogic_dataset():
    """Return the "test" split of the VisuLogic dataset.

    The dataset named by ``VISULOGIC_DATASET`` is loaded with
    ``datasets.load_dataset`` using ``DATASET_CACHE_DIR`` as its cache
    directory. The original note says this split holds 1000 questions.
    """
    return load_dataset(VISULOGIC_DATASET, cache_dir=DATASET_CACHE_DIR)["test"]
15
+
16
def _extract_choice(text):
    """Return the first standalone option letter (A-D) in *text*, or None."""
    import re  # function-scope import keeps this block self-contained

    # Require that the letter is not part of a longer ASCII word, so that
    # e.g. "THE ANSWER IS B" yields "B" rather than the "A" of "ANSWER".
    match = re.search(r"(?<![A-Z])[ABCD](?![A-Z])", text.upper())
    return match.group(0) if match else None


def qwen3_vl_predict(image_url, question, options):
    """Ask the Qwen-VL model (via DashScope) for the answer letter.

    Args:
        image_url: Image reference passed to the API as the ``image`` part.
        question: Question text inserted into the prompt.
        options: Sequence whose first four items are the texts of A-D.

    Returns:
        One of ``"A"``, ``"B"``, ``"C"``, ``"D"``. When no option letter can
        be found in the reply, ``"A"`` is returned (fallback kept from the
        original implementation; note that it counts as a real guess).

    Raises:
        RuntimeError: If the DashScope call does not return HTTP 200.
    """
    # Build the prompt: force a single-letter reply to avoid misjudging.
    prompt = (
        "\n"
        "视觉逻辑推理题,请严格按照要求回答:\n"
        f"问题:{question}\n"
        "选项:\n"
        f"A: {options[0]}\n"
        f"B: {options[1]}\n"
        f"C: {options[2]}\n"
        f"D: {options[3]}\n"
        "要求:仅输出答案对应的字母(A/B/C/D),不要任何多余文字!\n"
    )

    # One user turn carrying the image followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"image": image_url},
                {"text": prompt},
            ],
        }
    ]

    response = MultiModalConversation.call(
        model=QWEN_VL_MODEL,
        messages=messages,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
    )

    # Fail loudly with the API's own diagnostics instead of an opaque
    # AttributeError on a missing ``output``.
    if response.status_code != 200:
        raise RuntimeError(
            f"DashScope request failed: {response.code} {response.message}"
        )

    # The multimodal API returns the message content as a list of parts
    # such as [{"text": "..."}]; accept a plain string as well.
    content = response.output.choices[0].message.content
    if isinstance(content, str):
        text = content
    else:
        text = "".join(
            part.get("text", "") for part in content if isinstance(part, dict)
        )

    choice = _extract_choice(text.strip())
    return choice if choice is not None else "A"  # fallback: default to A
53
+
54
def run_evaluation():
    """Run the full evaluation: load data, predict, score, save results.

    Every sample of the dataset returned by ``load_visulogic_dataset`` is sent
    to ``qwen3_vl_predict`` and the prediction is compared with the sample's
    ``answer`` field. Progress is printed every 10 samples.

    Returns:
        A ``(total_acc, results)`` tuple. ``results`` is a list with one dict
        per question (``idx``, ``image_url``, ``question``, ``options``,
        ``gt_answer``, ``pred_answer``, ``is_correct``) followed by a final
        ``{"total_accuracy": total_acc}`` record. The same list is written as
        JSON to ``OUTPUT_PATH``.
    """
    # 1. Load the dataset.
    dataset = load_visulogic_dataset()
    total = len(dataset)
    results = []
    correct = 0

    # 2. Predict sample by sample.
    for idx, sample in enumerate(dataset):
        # Fields read from each sample: image_url / question / A-D / answer.
        # NOTE(review): this schema is assumed by the code — confirm it
        # matches the actual dataset columns.
        image_url = sample["image_url"]
        question = sample["question"]
        options = [sample["A"], sample["B"], sample["C"], sample["D"]]
        gt_answer = sample["answer"]  # reference answer, compared to the predicted letter

        pred_answer = qwen3_vl_predict(image_url, question, options)

        is_correct = pred_answer == gt_answer
        if is_correct:
            correct += 1

        results.append({
            "idx": idx,
            "image_url": image_url,
            "question": question,
            "options": options,
            "gt_answer": gt_answer,
            "pred_answer": pred_answer,
            "is_correct": is_correct,
        })

        # Report progress every 10 samples.
        if (idx + 1) % 10 == 0:
            print(f"进度:{idx+1}/{total},当前准确率:{correct/(idx+1):.2%}")

    # 3. Overall accuracy; an empty dataset yields 0.0 instead of raising
    #    ZeroDivisionError.
    total_acc = correct / total if total else 0.0
    results.append({"total_accuracy": total_acc})

    # 4. Save the results. The directory is derived from OUTPUT_PATH so that
    #    changing the configured path cannot leave the target dir missing.
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return total_acc, results
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 基础依赖
2
+ gradio>=4.20.0
3
+ python-dotenv>=1.0.1
4
+ requests>=2.31.0
5
+ numpy>=1.26.0
6
+ pandas>=2.1.0
7
+
8
+ # 多模态/模型依赖
9
+ torch>=2.1.0
10
+ transformers>=4.38.0
11
+ accelerate>=0.27.0
12
+ Pillow>=10.2.0
13
+
14
+ # Hugging Face 数据集
15
+ datasets>=2.17.0
16
+ huggingface-hub>=0.20.0
17
+
18
+ # 阿里云DashScope(调用Qwen3-VL API用)
19
+ dashscope>=1.14.0