rl4phyx-backup / ZeroSearch /One-Shot-RLVR /verify_vlm_processor.py
YUNTA88's picture
Upload folder using huggingface_hub
9a71cb6 verified
"""
Standalone check of the VLM processor and image loading.
Does not depend on a full veRL environment.
"""
import pandas as pd
from pathlib import Path
from PIL import Image
# Default inputs for the check. The image directory is machine-specific;
# edit these constants to point at your local copy of the dataset.
PARQUET_PATH = 'data/train/physics_vlm/metaphyx_all.parquet'
IMAGE_DIR = Path("d:/Research/Rl4Phyx/MetaPhyX/data/images")
PROCESSOR_NAME = 'Qwen/Qwen2.5-VL-7B-Instruct'


def extract_prompt_text(prompt):
    """Return the text of the first chat message stored in a ``prompt`` cell.

    Args:
        prompt: Value of the ``prompt`` column for one row. Either a sequence
            of message dicts carrying a ``'content'`` key, or any other
            object (which is stringified).

    Returns:
        ``prompt[0]['content']`` for a non-empty sequence, ``""`` for an
        empty sequence, and ``str(prompt)`` for anything else.
    """
    # List-typed parquet cells come back from pandas as numpy arrays rather
    # than Python lists, so normalise array-likes before the type check;
    # otherwise the whole array would be stringified via its repr.
    if hasattr(prompt, "tolist"):
        prompt = prompt.tolist()
    if isinstance(prompt, (list, tuple)):
        return prompt[0]['content'] if prompt else ""
    return str(prompt)


def load_rgb_image(image_path):
    """Open ``image_path`` and return it as a fully loaded RGB image.

    The source file is closed before returning; ``convert`` yields an
    independent in-memory copy, so the result stays usable afterwards.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    with Image.open(image_path) as raw:
        return raw.convert('RGB')


def main():
    """Run the five verification steps and print their results.

    Loads the first dataset row, opens its image, loads the processor,
    builds a single-turn multimodal message and encodes it. Failures while
    loading the data, the image or the processor propagate as exceptions;
    failures during encoding are caught and reported with a traceback.
    """
    print("=" * 50)
    print("多模态 VLM 验证脚本")
    print("=" * 50)

    # Step 1: load the dataset
    print("\n[1] 加载 Parquet 数据...")
    df = pd.read_parquet(PARQUET_PATH)
    print(f" 总行数: {len(df)}")
    print(f" 列名: {df.columns.tolist()}")
    print(f" image_path 示例: {df['image_path'].iloc[0]}")

    # Step 2: load a sample image
    print("\n[2] 加载示例图像...")
    image_path = IMAGE_DIR / df['image_path'].iloc[0]
    print(f" 完整路径: {image_path}")
    print(f" 文件存在: {image_path.exists()}")
    image = load_rgb_image(image_path)
    print(f" 图像尺寸: {image.size}")

    # Step 3: load the VLM processor
    print("\n[3] 加载 Qwen2.5-VL Processor...")
    # Imported lazily so the data/image checks above run even when
    # transformers is slow to import or not installed.
    from transformers import AutoProcessor
    # NOTE(review): trust_remote_code=True permits executing code shipped in
    # the model repository; confirm it is actually required for this model.
    processor = AutoProcessor.from_pretrained(PROCESSOR_NAME, trust_remote_code=True)
    print(f" Processor 类型: {type(processor).__name__}")

    # Step 4: build the multimodal message
    print("\n[4] 构建多模态消息...")
    text_content = extract_prompt_text(df['prompt'].iloc[0])
    print(f" 文本长度: {len(text_content)}")
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text_content},
        ],
    }]

    # Step 5: encode with the processor
    print("\n[5] 使用 Processor 编码...")
    try:
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt",
            padding=True,
        )
        print(f" input_ids shape: {inputs['input_ids'].shape}")
        print(f" attention_mask shape: {inputs['attention_mask'].shape}")
        if 'pixel_values' in inputs:
            print(f" pixel_values shape: {inputs['pixel_values'].shape}")
            print("\n✅ 多模态编码成功!")
        else:
            print("\n❌ 没有 pixel_values 输出")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report the failure with
        # its traceback instead of aborting before the closing banner.
        print(f"\n❌ 编码失败: {e}")
        import traceback
        traceback.print_exc()

    print("\n" + "=" * 50)


if __name__ == "__main__":
    main()