RoboMind VLA: vision-language reward model for robot locomotion (built with Codex)

321ba64 verified 20 days ago

11.4 kB

	"""
	RoboMind VLA — Task 12: tests.py

	End-to-end tests and verification for the entire pipeline.
	Runs on Modal CPU for data checks, GPU for model checks.

	Usage:
	modal run tests.py
	"""

	from __future__ import annotations

	import json

	import modal

	# Container image used by every test function in this file: Debian slim
	# with Python 3.11 plus the packages listed in `pip_install` below.
	image = (
	modal.Image.debian_slim(python_version="3.11")
	.pip_install(
	"numpy<2",
	"torch==2.4.0",
	"torchvision==0.19.0",
	"transformers==4.40.0",
	"peft==0.11.1",
	"accelerate==0.30.1",
	"datasets",
	"pillow",
	"huggingface_hub",
	)
	# Build-time step: run a Python one-liner that writes a stub `flash_attn`
	# package (empty `__init__.py` plus `flash_attn_interface.py`) under
	# `<sys.prefix>/lib/python3.11/site-packages`. The stub defines
	# `flash_attn_func` and `flash_attn_varlen_func`, both of which raise
	# NotImplementedError when called.
	# NOTE(review): presumably this exists so that code which merely imports
	# `flash_attn` can load without the real CUDA extension — confirm.
	# The site-packages path hard-codes `python3.11`, so it has to stay in sync
	# with `python_version` above.
	.run_commands(
	"python -c \""
	"import os, sys; "
	"d = os.path.join(sys.prefix, 'lib/python3.11/site-packages/flash_attn'); "
	"os.makedirs(d, exist_ok=True); "
	"open(os.path.join(d, '__init__.py'), 'w').write(''); "
	"open(os.path.join(d, 'flash_attn_interface.py'), 'w').write("
	"'def flash_attn_func(a, *kw): raise NotImplementedError\\n"
	"def flash_attn_varlen_func(a, *kw): raise NotImplementedError\\n'); "
	"print('flash_attn stub created')\""
	)
	)

	# Modal app that the test functions and the local entrypoint register on.
	app = modal.App("robomind-tests")
	# Named Modal volume; `create_if_missing=True` creates it when absent.
	volume = modal.Volume.from_name("robomind-data", create_if_missing=True)


	@app.function(image=image, secrets=[modal.Secret.from_name("huggingface-secret")], timeout=300)
	def test_data_repos():
	    """Test 1: Verify HF data repos exist and have correct structure.

	    Lists the files of the rollouts repo, the judge dataset repo and the
	    LoRA adapter repo on the Hugging Face Hub and records a small summary
	    for each. The probes are independent: when one raises, the failure is
	    stored as ``{"exists": False, "error": "<message>"}`` and the remaining
	    probes still run.

	    Returns:
	        dict with the keys ``"rollouts"``, ``"dataset"`` and ``"adapter"``,
	        each mapping to that repo's summary dict.
	    """
	    import os

	    from huggingface_hub import HfApi, login

	    # Authenticate only when a token was injected by the Modal secret.
	    hf_token = os.environ.get("HF_TOKEN")
	    if hf_token:
	        login(token=hf_token)

	    api = HfApi()
	    results = {}

	    # Check robomind-rollouts
	    # NOTE(review): no repo_type is given, so the Hub resolves this id as a
	    # model repo. If the rollouts are published as a dataset repo this call
	    # needs repo_type="dataset" too — confirm against the upload script.
	    try:
	        files = api.list_repo_files("mitvho09/robomind-rollouts")
	        mp4_count = sum(1 for name in files if name.endswith(".mp4"))
	        has_metadata = "metadata.jsonl" in files
	        results["rollouts"] = {
	            "exists": True,
	            "mp4_files": mp4_count,
	            "has_metadata": has_metadata,
	            "total_files": len(files),
	        }
	        print(f"[test] rollouts: {mp4_count} mp4s, metadata={has_metadata}")
	    except Exception as e:  # deliberate: any failure is reported, not raised
	        results["rollouts"] = {"exists": False, "error": str(e)}
	        print(f"[test] rollouts FAILED: {e}")

	    # Check robomind-loco-judge-dataset
	    # The same id is loaded with `datasets.load_dataset` in
	    # test_dataset_structure, i.e. it is a dataset repo; without
	    # repo_type="dataset" the Hub would look for a model repo of that name.
	    try:
	        files = api.list_repo_files(
	            "mitvho09/robomind-loco-judge-dataset", repo_type="dataset"
	        )
	        results["dataset"] = {
	            "exists": True,
	            "total_files": len(files),
	        }
	        print(f"[test] dataset: {len(files)} files")
	    except Exception as e:  # deliberate: any failure is reported, not raised
	        results["dataset"] = {"exists": False, "error": str(e)}
	        print(f"[test] dataset FAILED: {e}")

	    # Check adapter
	    try:
	        files = api.list_repo_files("mitvho09/robomind-minicpm-loco-lora")
	        has_adapter = "adapter_model.safetensors" in files
	        has_config = "adapter_config.json" in files
	        results["adapter"] = {
	            "exists": True,
	            "has_adapter_weights": has_adapter,
	            "has_config": has_config,
	            "total_files": len(files),
	        }
	        print(f"[test] adapter: weights={has_adapter}, config={has_config}")
	    except Exception as e:  # deliberate: any failure is reported, not raised
	        results["adapter"] = {"exists": False, "error": str(e)}
	        print(f"[test] adapter FAILED: {e}")

	    return results


	@app.function(image=image, timeout=300)
	def test_dataset_structure():
	    """Test 2: Verify dataset has correct schema and content.

	    Loads the ``train`` split of the judge dataset, records which required
	    columns are present and, when the columns needed for it exist and the
	    split is non-empty, gathers content statistics (distinct envs/tiers,
	    images per sample, union of ``target_json`` keys).

	    Returns:
	        dict with ``n_samples``, ``non_empty`` and one ``has_<col>`` flag per
	        required column. The statistics keys (``envs``, ``tiers``,
	        ``min_images``, ``max_images``, ``avg_images``, ``target_json_keys``)
	        are added only when they could be computed.
	    """
	    from datasets import load_dataset

	    ds = load_dataset("mitvho09/robomind-loco-judge-dataset", split="train")
	    results = {"n_samples": len(ds), "non_empty": len(ds) > 0}

	    required_cols = ["env", "tier", "episode_id", "images", "target_json"]
	    for col in required_cols:
	        results[f"has_{col}"] = col in ds.column_names

	    # Only these columns are read below; stop with the flags recorded
	    # instead of raising KeyError when one of them is absent.
	    stats_cols = ("env", "tier", "images", "target_json")
	    missing = [col for col in stats_cols if col not in ds.column_names]
	    if missing:
	        print(f"[test] dataset FAILED: missing columns {missing}")
	        return results

	    # min()/max() and the average are undefined for an empty split.
	    if len(ds) == 0:
	        print("[test] dataset FAILED: train split is empty")
	        return results

	    envs = set(ds["env"])
	    tiers = set(ds["tier"])
	    results["envs"] = sorted(envs)
	    results["tiers"] = sorted(tiers)

	    # Single pass over the rows: collect the image count and the target
	    # keys together so each row is materialised once.
	    n_images = []
	    all_keys = set()
	    for row in ds:
	        n_images.append(len(row["images"]))
	        all_keys.update(json.loads(row["target_json"]).keys())

	    results["min_images"] = min(n_images)
	    results["max_images"] = max(n_images)
	    results["avg_images"] = sum(n_images) / len(n_images)
	    results["target_json_keys"] = sorted(all_keys)

	    print(f"[test] dataset: {results['n_samples']} samples, {len(envs)} envs, {len(tiers)} tiers")
	    print(f"[test] images per sample: min={results['min_images']}, max={results['max_images']}")
	    print(f"[test] envs: {results['envs']}")
	    print(f"[test] target keys: {results['target_json_keys']}")

	    return results


	@app.function(
	    image=image,
	    gpu="A100-40GB",
	    volumes={"/data": volume},
	    secrets=[modal.Secret.from_name("huggingface-secret")],
	    timeout=600,
	)
	def test_model_load():
	    """Test 3: Verify model loads correctly with LoRA adapter.

	    Loads, in order, the tokenizer, the base model and the LoRA adapter.
	    Each stage sets a ``<stage>_loaded`` flag; the first stage that raises
	    also stores ``<stage>_error`` and ends the test early.

	    Returns:
	        dict of stage flags (and an error message for a failed stage), plus
	        ``model_ready`` when all three stages succeeded.
	    """
	    import os

	    import torch
	    from transformers import AutoModel, AutoTokenizer
	    from peft import PeftModel
	    from huggingface_hub import login

	    token = os.environ.get("HF_TOKEN")
	    if token:
	        login(token=token)

	    base_id = "openbmb/MiniCPM-V-2_6"
	    adapter_id = "mitvho09/robomind-minicpm-loco-lora"
	    report = {}

	    def record_failure(stage, label, exc):
	        # Mark the stage as failed, keep the message, hand back the report
	        # so the caller can return it directly.
	        report[f"{stage}_loaded"] = False
	        report[f"{stage}_error"] = str(exc)
	        print(f"[test] {label} FAILED: {exc}")
	        return report

	    # Stage 1: tokenizer
	    try:
	        AutoTokenizer.from_pretrained(base_id, trust_remote_code=True)
	        report["tokenizer_loaded"] = True
	        print("[test] tokenizer loaded")
	    except Exception as exc:
	        return record_failure("tokenizer", "tokenizer", exc)

	    # Stage 2: base model in bfloat16, placed by device_map="auto"
	    try:
	        backbone = AutoModel.from_pretrained(
	            base_id,
	            trust_remote_code=True,
	            torch_dtype=torch.bfloat16,
	            device_map="auto",
	        )
	        report["base_model_loaded"] = True
	        print("[test] base model loaded")
	    except Exception as exc:
	        return record_failure("base_model", "base model", exc)

	    # Stage 3: LoRA adapter on top of the base model
	    try:
	        peft_model = PeftModel.from_pretrained(backbone, adapter_id)
	        report["adapter_loaded"] = True
	        print("[test] LoRA adapter loaded")
	    except Exception as exc:
	        return record_failure("adapter", "adapter", exc)

	    peft_model.eval()
	    report["model_ready"] = True
	    print("[test] model ready for inference")
	    return report


	@app.function(
	    image=image,
	    gpu="A100-40GB",
	    volumes={"/data": volume},
	    secrets=[modal.Secret.from_name("huggingface-secret")],
	    timeout=600,
	)
	def test_inference():
	    """Test 4: Run a dummy inference to verify the model generates output.

	    Loads the base model with the LoRA adapter, feeds it three random RGB
	    frames together with the judge instruction and records what comes back.
	    Load or generation errors are not caught here; they propagate to the
	    caller.

	    Returns:
	        dict with ``response_length``, ``response_preview`` (first 300
	        characters) and ``contains_json`` (whether both ``{`` and ``}``
	        occur in the response).
	    """
	    import os
	    import torch
	    from transformers import AutoModel, AutoTokenizer
	    from peft import PeftModel
	    from huggingface_hub import login
	    from PIL import Image
	    import numpy as np

	    hf_token = os.environ.get("HF_TOKEN")
	    if hf_token:
	        login(token=hf_token)

	    results = {}

	    tokenizer = AutoTokenizer.from_pretrained(
	        "openbmb/MiniCPM-V-2_6", trust_remote_code=True
	    )
	    base_model = AutoModel.from_pretrained(
	        "openbmb/MiniCPM-V-2_6",
	        trust_remote_code=True,
	        torch_dtype=torch.bfloat16,
	        device_map="auto",
	    )
	    model = PeftModel.from_pretrained(base_model, "mitvho09/robomind-minicpm-loco-lora")
	    model.eval()

	    # randint's upper bound is exclusive, so 256 is needed to cover the
	    # full uint8 range.
	    dummy_images = [
	        Image.fromarray(np.random.randint(0, 256, (224, 224, 3), dtype=np.uint8))
	        for _ in range(3)
	    ]

	    instruction = (
	        "You are RoboMind VLA, a vision-language reward model for humanoid "
	        "locomotion. You are shown keyframes from a robot locomotion rollout. "
	        "The robot was commanded to \"walk forward\". Analyze the rollout and "
	        "respond with ONLY a JSON object with these exact keys: timestep_range, "
	        "phase, command, command_followed, stability, fall_risk, gait_quality, "
	        "predicted_reward, anomaly, explanation."
	    )

	    # Multi-image input follows the MiniCPM-V 2.6 model card: the PIL images
	    # go into the message's content list ahead of the text, with image=None.
	    # NOTE(review): a list handed over via `image=` together with string
	    # content appears not to be picked up as images by the remote chat()
	    # code, which would make this a text-only run — confirm against the
	    # model revision in use.
	    user_content = [*dummy_images, instruction]

	    with torch.no_grad():
	        output = model.chat(
	            image=None,
	            msgs=[{"role": "user", "content": user_content}],
	            tokenizer=tokenizer,
	            max_new_tokens=256,
	        )

	    response = output if isinstance(output, str) else str(output)
	    results["response_length"] = len(response)
	    results["response_preview"] = response[:300]

	    # Cheap structural check only; the response is not parsed as JSON.
	    has_json = "{" in response and "}" in response
	    results["contains_json"] = has_json

	    print(f"[test] inference OK: {len(response)} chars")
	    print(f"[test] response: {response[:300]}")
	    return results


	@app.function(image=image, volumes={"/data": volume}, timeout=300)
	def test_volume_data():
	    """Test 5: Verify Modal volume has expected data.

	    Inspects the volume mounted at ``/data``: counts ``.jpg`` files under
	    ``/data/ft/images``, counts the entries of ``/data/ft/train.json``,
	    checks ``/data/ft/lora_output`` for a file whose name contains
	    ``adapter_model`` and records whether ``/data/MiniCPM-V`` exists.
	    Paths that are absent are skipped without adding a key.

	    Returns:
	        dict that always contains ``minicpm_cloned`` and, depending on what
	        exists, ``ft_images``, ``train_samples``, ``lora_output_exists`` and
	        ``has_adapter_in_output``.
	    """
	    import os

	    results = {}

	    # The volume must be mounted via the decorator above; without it none of
	    # the /data paths exist inside the container.
	    ft_dir = "/data/ft"
	    if os.path.exists(ft_dir):
	        images_dir = os.path.join(ft_dir, "images")
	        if os.path.exists(images_dir):
	            n_images = sum(1 for name in os.listdir(images_dir) if name.endswith(".jpg"))
	            results["ft_images"] = n_images
	            print(f"[test] volume images: {n_images}")

	        train_json = os.path.join(ft_dir, "train.json")
	        if os.path.exists(train_json):
	            with open(train_json, encoding="utf-8") as f:
	                data = json.load(f)
	            results["train_samples"] = len(data)
	            print(f"[test] volume train.json: {len(data)} samples")

	        lora_dir = os.path.join(ft_dir, "lora_output")
	        if os.path.exists(lora_dir):
	            results["lora_output_exists"] = True
	            has_adapter = any("adapter_model" in name for name in os.listdir(lora_dir))
	            results["has_adapter_in_output"] = has_adapter
	            print(f"[test] LoRA output exists, adapter={has_adapter}")

	    mcpm_dir = "/data/MiniCPM-V"
	    results["minicpm_cloned"] = os.path.exists(mcpm_dir)

	    print(f"[test] volume data: {results}")
	    return results


	def _has_failure(value):
	    """Return True when a (possibly nested) test result signals a failure.

	    A failure is a literal ``False`` anywhere in the structure, or a dict key
	    named ``error`` or ending in ``_error``. Free-text values are not
	    searched, so model output that happens to contain the word "error" does
	    not count.
	    """
	    if value is False:
	        return True
	    if isinstance(value, dict):
	        return any(
	            (isinstance(key, str) and (key == "error" or key.endswith("_error")))
	            or _has_failure(item)
	            for key, item in value.items()
	        )
	    if isinstance(value, (list, tuple)):
	        return any(_has_failure(item) for item in value)
	    return False


	@app.local_entrypoint()
	def main():
	    """Run all tests.

	    Invokes the five remote test functions one after another, then prints a
	    PASS/FAIL line per test and the totals. An exception raised by a remote
	    call is not caught and aborts the run.

	    Returns:
	        dict mapping each test's short name to the result dict it returned.
	    """
	    print("=" * 60)
	    print("RoboMind VLA — End-to-End Tests")
	    print("=" * 60)

	    suite = [
	        ("data_repos", "Test 1: HF Data Repos", test_data_repos),
	        ("dataset_structure", "Test 2: Dataset Structure", test_dataset_structure),
	        ("model_load", "Test 3: Model Load", test_model_load),
	        ("inference", "Test 4: Inference", test_inference),
	        ("volume_data", "Test 5: Volume Data", test_volume_data),
	    ]

	    all_results = {}
	    for key, title, remote_fn in suite:
	        print(f"\n--- {title} ---")
	        all_results[key] = remote_fn.remote()

	    print("\n" + "=" * 60)
	    print("SUMMARY")
	    print("=" * 60)

	    passed = 0
	    failed = 0
	    for test_name, result in all_results.items():
	        # Recursive check: test_data_repos nests its flags one level down,
	        # which a top-level-only scan would report as PASS.
	        if _has_failure(result):
	            status = "FAIL"
	            failed += 1
	        else:
	            status = "PASS"
	            passed += 1
	        print(f" {test_name}: {status}")

	    print(f"\nTotal: {passed} passed, {failed} failed, {passed + failed} total")
	    return all_results