"""Standalone check of the VLM processor and image loading.

Does not depend on the full veRL environment. The script walks through five
steps and prints the outcome of each one:

1. read the parquet dataset,
2. load the image referenced by the first row,
3. load the Qwen2.5-VL processor,
4. build a multimodal chat message from the first row's prompt,
5. encode text + image and report the resulting tensor shapes.

The process exits with status 0 only when encoding produced ``pixel_values``.
"""
import traceback
from pathlib import Path

import pandas as pd

# Defaults for the sample being verified; override via ``main()`` arguments.
PARQUET_PATH = 'data/train/physics_vlm/metaphyx_all.parquet'
IMAGE_DIR = Path("d:/Research/Rl4Phyx/MetaPhyX/data/images")
MODEL_NAME = 'Qwen/Qwen2.5-VL-7B-Instruct'


def extract_prompt_text(prompt):
    """Return the text of the first chat message stored in *prompt*.

    Args:
        prompt: Value of the ``prompt`` column for one row. Handled shapes are
            a plain string, a list/tuple of message dicts, or any container
            exposing ``tolist()`` (pandas typically materialises list-valued
            parquet columns as numpy arrays, which a bare ``list`` check
            misses).

    Returns:
        ``prompt`` itself when it is a string, the ``"content"`` value of the
        first message when one is available, otherwise ``str(prompt)``.
    """
    if isinstance(prompt, str):
        return prompt
    # Normalise array-like containers so they take the same path as a list.
    if hasattr(prompt, "tolist"):
        prompt = prompt.tolist()
    if isinstance(prompt, (list, tuple)) and prompt:
        first_message = prompt[0]
        if isinstance(first_message, dict) and "content" in first_message:
            return first_message["content"]
    # Unknown or empty structure: fall back to the textual representation.
    return str(prompt)


def load_rgb_image(image_path):
    """Open *image_path* and return an RGB copy, closing the file afterwards.

    Raises:
        FileNotFoundError: If the file does not exist (raised by Pillow).
    """
    # Imported here so the data-loading step can still run and report when
    # Pillow is missing; the original script deferred transformers the same way.
    from PIL import Image

    # ``convert`` returns a new in-memory image, so the file handle can be
    # released as soon as the conversion is done.
    with Image.open(image_path) as raw_image:
        return raw_image.convert('RGB')


def encode_sample(processor, image, text_content):
    """Encode one text + image pair and print the resulting tensor shapes.

    Args:
        processor: Processor object returned by ``AutoProcessor.from_pretrained``.
        image: RGB image to attach to the user message.
        text_content: Text placed next to the image in the user message.

    Returns:
        True when the processor output contains ``pixel_values``, False when
        it does not or when encoding raised an exception.
    """
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text_content}
        ]
    }]
    try:
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = processor(
            text=[text],
            images=[image],
            return_tensors="pt",
            padding=True
        )
        print(f" input_ids shape: {inputs['input_ids'].shape}")
        print(f" attention_mask shape: {inputs['attention_mask'].shape}")
        if 'pixel_values' in inputs:
            print(f" pixel_values shape: {inputs['pixel_values'].shape}")
            print("\n✅ 多模态编码成功!")
            return True
        print("\n❌ 没有 pixel_values 输出")
        return False
    except Exception as e:
        # Top-level diagnostic boundary: report the failure with a traceback
        # instead of aborting, so the closing banner is still printed.
        print(f"\n❌ 编码失败: {e}")
        traceback.print_exc()
        return False


def main(parquet_path=PARQUET_PATH, image_dir=IMAGE_DIR, model_name=MODEL_NAME):
    """Run the five verification steps on the first row of the dataset.

    Args:
        parquet_path: Parquet file with ``image_path`` and ``prompt`` columns.
        image_dir: Directory that ``image_path`` values are relative to.
        model_name: Model id or path passed to ``AutoProcessor.from_pretrained``.

    Returns:
        True when the sample was encoded and ``pixel_values`` was produced.
    """
    print("=" * 50)
    print("多模态 VLM 验证脚本")
    print("=" * 50)

    # Step 1: load the data.
    print("\n[1] 加载 Parquet 数据...")
    df = pd.read_parquet(parquet_path)
    print(f" 总行数: {len(df)}")
    print(f" 列名: {df.columns.tolist()}")
    if df.empty:
        # Without a first row there is nothing to verify; say so explicitly
        # rather than failing with an IndexError from ``iloc[0]``.
        print("\n❌ 数据为空,无法继续验证")
        print("\n" + "=" * 50)
        return False
    first_row = df.iloc[0]
    print(f" image_path 示例: {first_row['image_path']}")

    # Step 2: load the sample image.
    print("\n[2] 加载示例图像...")
    image_path = Path(image_dir) / first_row['image_path']
    print(f" 完整路径: {image_path}")
    print(f" 文件存在: {image_path.exists()}")
    image = load_rgb_image(image_path)
    print(f" 图像尺寸: {image.size}")

    # Step 3: load the VLM processor.
    print("\n[3] 加载 Qwen2.5-VL Processor...")
    from transformers import AutoProcessor
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally; confirm it is still required for this model
    # with the installed transformers version and drop it if not.
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    print(f" Processor 类型: {type(processor).__name__}")

    # Step 4: build the multimodal message.
    print("\n[4] 构建多模态消息...")
    text_content = extract_prompt_text(first_row['prompt'])
    print(f" 文本长度: {len(text_content)}")

    # Step 5: encode with the processor.
    print("\n[5] 使用 Processor 编码...")
    succeeded = encode_sample(processor, image, text_content)

    print("\n" + "=" * 50)
    return succeeded


if __name__ == "__main__":
    # A non-zero status lets callers and automation detect a failed check.
    raise SystemExit(0 if main() else 1)