Instructions to use NU-World-Model-Embodied-AI/phyjudge-9B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use NU-World-Model-Embodied-AI/phyjudge-9B with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3.5-9B")
model = PeftModel.from_pretrained(base_model, "NU-World-Model-Embodied-AI/phyjudge-9B")
- Notebooks
- Google Colab
- Kaggle
| """Run inference with the judge LoRA adapter. | |
| The script can either load files from a local directory or pull them | |
| directly from the Hugging Face Hub. By default it points at the | |
| companion repository ``NU-World-Model-Embodied-AI/phyjudge-9B``: | |
| # From the Hub (no clone needed): | |
| python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --metric SA | |
| python infer.py --video demo.mp4 --caption "A ball rolls down a ramp." --law gravity | |
| # From a local clone of the model repo: | |
| python infer.py --adapter-dir /path/to/local/clone --video demo.mp4 \ | |
| --caption "A ball rolls down a ramp." --law gravity | |
| It loads: | |
| - adapter_config.json to find the base model | |
| - adapter_model.safetensors through PEFT | |
| - subq+human.yaml to render the scoring prompt | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Any | |
| import torch | |
| import yaml | |
| from peft import PeftModel | |
| from transformers import AutoProcessor | |
# Sub-questions for the general (non-physical-law) metrics. The keys are the
# values accepted by ``--metric``; ``build_prompt`` numbers the list for the
# selected key and substitutes it into the matching ``eval_prompts`` template.
GENERAL_SUB_QUESTIONS: dict[str, list[str]] = {
    "SA": [
        "Are the main objects in the caption present in the video?",
        "Are the key actions or interactions from the caption visible?",
        "Are important scene attributes and relationships preserved?",
        "Does the video avoid major contradictions to the caption?",
    ],
    "PTV": [
        "Do causes appear before their effects?",
        "Do physical events unfold in a plausible temporal order?",
        "Are motion transitions continuous rather than abrupt jumps or loops?",
        "Does the sequence avoid impossible reversals or repeated resets?",
    ],
    "persistence": [
        "Do objects maintain consistent existence throughout the video?",
        "Do objects keep a stable shape, size, color, and texture?",
        "Do objects avoid disappearing, appearing, or transforming unexpectedly?",
        "Do objects preserve identity through motion and brief occlusion?",
    ],
}
# Default criterion text for each physical law. The sorted keys are the values
# accepted by ``--law``; ``build_prompt`` uses the text for the selected law
# unless an explicit ``criteria`` override is supplied.
PHYSICAL_CRITERIA: dict[str, str] = {
    "gravity": "Do unsupported objects fall downward? Do thrown objects follow a curved trajectory? Does poured liquid fall with gravity?",
    "inertia": "Do stationary objects remain still unless acted upon? Do moving objects maintain their motion unless stopped by friction, collision, or an obstacle?",
    "momentum": "After collision, push, or pull, is the direction of motion reasonable? Ignore speed magnitude.",
    "impenetrability": "Do objects maintain impenetrability -- no passing through each other?",
    "collision": "After impact, is there reasonable bounce/shatter/deformation? Does response match impact force?",
    "material": "Does each material respond according to its properties? (glass shatters, rubber bounces, metal is rigid, cloth deforms softly, etc.)",
    "buoyancy": "Do dense objects sink? Do wood/plastic float?",
    "displacement": "When you add more liquid or put an object into it, does the liquid level rise in a realistic way? Does it overflow when full?",
    "flow_dynamics": "Does the liquid's overall motion behave realistically over time -- flowing along surfaces, spreading, draining naturally?",
    "boundary_interaction": "When the liquid hits a boundary such as a rock face, container wall, or floor, does it respond realistically? Do local splash, rebound, or split patterns on impact look physically plausible?",
    "fluid_continuity": "Does the liquid avoid disappearing or appearing out of nowhere? Small splashes that briefly break apart are okay.",
    "reflection": "Does the reflection roughly match objects and colors in the scene, and avoid completely unrelated content?",
    "shadow": "Are shadow directions consistent with light source? Do shadows move with objects?",
}
# Sub-questions for each physical law. ``build_prompt`` indexes this mapping
# with a law it has only validated against ``PHYSICAL_CRITERIA``, so the two
# dictionaries must keep the same set of keys.
PHYSICAL_SUB_QUESTIONS: dict[str, list[str]] = {
    "gravity": [
        "Do unsupported objects or liquids move downward over time?",
        "Do thrown or falling objects follow a plausible gravity-driven path?",
        "Does the video avoid objects floating or rising without support?",
    ],
    "inertia": [
        "Do stationary objects remain still unless a visible force acts on them?",
        "Do moving objects continue plausibly until friction, collision, or an obstacle changes their motion?",
        "Does the video avoid unexplained starts, stops, or direction changes?",
    ],
    "momentum": [
        "After contact, push, pull, or collision, are motion directions plausible?",
        "Does the reacting object move in a direction consistent with the interaction?",
        "Does the video avoid impossible reversals or unrelated motion changes?",
    ],
    "impenetrability": [
        "Do solid objects avoid passing through one another?",
        "Do contacts and overlaps remain physically plausible?",
        "Does the video avoid obvious clipping or penetration artifacts?",
    ],
    "collision": [
        "Does impact cause a plausible bounce, break, deformation, or transfer of motion?",
        "Is the response direction consistent with the collision?",
        "Does the response avoid being much too weak, too strong, or unrelated to the impact?",
    ],
    "material": [
        "Do objects respond consistently with their apparent material?",
        "Are rigid, soft, brittle, elastic, or fluid-like objects animated appropriately?",
        "Does the video avoid material behavior that contradicts the scene?",
    ],
    "buoyancy": [
        "Do objects sink or float in a way consistent with apparent density?",
        "Does the floating or sinking behavior stay stable over time?",
        "Does the video avoid unsupported hovering or impossible underwater motion?",
    ],
    "displacement": [
        "Does liquid level rise when volume is added or an object enters it?",
        "Does overflow happen only when the container is plausibly full?",
        "Does the liquid volume remain visually plausible?",
    ],
    "flow_dynamics": [
        "Does liquid flow along surfaces, spread, or drain naturally?",
        "Does the flow direction follow gravity and boundaries?",
        "Does the video avoid abrupt stops, reversals, or unsupported uphill flow?",
    ],
    "boundary_interaction": [
        "Does liquid react plausibly when hitting a wall, floor, container, or obstacle?",
        "Are splash, rebound, or split patterns locally plausible?",
        "Does the liquid remain consistent after interacting with boundaries?",
    ],
    "fluid_continuity": [
        "Does liquid avoid disappearing or appearing without cause?",
        "Does the amount of liquid remain broadly consistent?",
        "Are splashes and separations temporary and physically plausible?",
    ],
    "reflection": [
        "Does the reflection match nearby objects, colors, and motion?",
        "Does the reflected content stay spatially consistent with the scene?",
        "Does the video avoid unrelated or impossible reflection content?",
    ],
    "shadow": [
        "Are shadows consistent with the apparent light source direction?",
        "Do shadows move with the objects that cast them?",
        "Does the video avoid missing, detached, or contradictory shadows?",
    ],
}
def load_json(path: Path) -> dict[str, Any]:
    """Parse the JSON file at ``path`` and return the decoded value.

    The file is opened explicitly as UTF-8 so that non-ASCII content is read
    the same way regardless of the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with path.open(encoding="utf-8") as f:
        return json.load(f)
def load_yaml(path: Path) -> dict[str, Any]:
    """Parse the YAML file at ``path`` with the safe loader and return its mapping.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the content is not valid YAML.
        ValueError: if the document is empty or its top level is not a mapping.
    """
    with path.open(encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load yields None for an empty document and may yield a list or a
    # scalar; fail here with the file name instead of letting a later key
    # lookup raise an opaque TypeError.
    if not isinstance(data, dict):
        raise ValueError(
            f"expected a YAML mapping in {path}, got {type(data).__name__}"
        )
    return data
def questions_block(questions: list[str]) -> str:
    """Format ``questions`` as a numbered list, one ``N. text`` entry per line.

    Numbering starts at 1. An empty list produces an empty string.
    """
    numbered_lines = []
    for position, text in enumerate(questions, start=1):
        numbered_lines.append(f"{position}. {text}")
    return "\n".join(numbered_lines)
def build_prompt(
    cfg: dict[str, Any],
    caption: str,
    *,
    metric: str | None = None,
    law: str | None = None,
    criteria: str | None = None,
) -> tuple[str, str, str]:
    """Render the system prompt, user prompt and score key for one evaluation.

    A truthy ``metric`` takes precedence and uses ``cfg["eval_prompts"][metric]``
    with the metric's general sub-questions. Otherwise ``law`` is required and
    ``cfg["physical_template"]`` is filled in with the law name, the criterion
    text (``criteria`` if given, else the built-in text for the law) and the
    law's sub-questions.

    Returns:
        ``(system_prompt, user_prompt, key)`` where ``key`` is the chosen
        metric or law name.

    Raises:
        ValueError: if the metric or law is unknown, or neither is given.
    """
    if metric:
        if metric not in GENERAL_SUB_QUESTIONS:
            raise ValueError(f"unknown metric: {metric}")
        metric_template = cfg["eval_prompts"][metric]
        metric_fields = {
            "prompt": caption,
            "questions_block": questions_block(GENERAL_SUB_QUESTIONS[metric]),
        }
        user_text = metric_template.format(**metric_fields)
        return cfg["system_prompt"], user_text, metric

    if not law:
        raise ValueError("either --metric or --law is required")
    if law not in PHYSICAL_CRITERIA:
        raise ValueError(f"unknown law: {law}")

    law_template = cfg["physical_template"]
    law_fields = {
        "prompt": caption,
        "law": law,
        # An explicit (non-empty) override wins over the built-in criterion.
        "criteria": criteria or PHYSICAL_CRITERIA[law],
        "questions_block": questions_block(PHYSICAL_SUB_QUESTIONS[law]),
    }
    user_text = law_template.format(**law_fields)
    return cfg["system_prompt"], user_text, law
def load_base_model(base_id: str, dtype: torch.dtype, device_map: str):
    """Load the base model ``base_id`` with the first transformers auto-class that works.

    The auto-classes are tried in a fixed order. Any exception from looking up
    the class or from ``from_pretrained`` is recorded and the next class is
    tried, because which classes exist depends on the installed transformers
    version.

    Raises:
        RuntimeError: if every candidate fails; the message lists each
            candidate's error.
    """
    candidate_classes = (
        "AutoModelForImageTextToText",
        "AutoModelForVision2Seq",
        "AutoModelForCausalLM",
    )
    failures: list[str] = []
    for candidate in candidate_classes:
        try:
            transformers_mod = __import__("transformers", fromlist=[candidate])
            loader_cls = getattr(transformers_mod, candidate)
            loaded = loader_cls.from_pretrained(
                base_id,
                torch_dtype=dtype,
                device_map=device_map,
                # NOTE(review): allows repository-provided code to run locally.
                trust_remote_code=True,
            )
        except Exception as err:  # pragma: no cover - depends on local transformers version
            failures.append(f"{candidate}: {err}")
        else:
            return loaded
    raise RuntimeError("failed to load base model:\n" + "\n".join(failures))
def resolve_adapter_dir(source: str) -> Path:
    """Resolve ``source`` to a local directory that holds the adapter files.

    When ``source`` names an existing directory that contains
    ``adapter_config.json``, that directory is returned unchanged. Anything
    else is treated as a Hugging Face Hub repo id and fetched with
    ``snapshot_download``; the returned path is the downloaded snapshot.

    Raises:
        ImportError: if the Hub is needed but ``huggingface_hub`` is missing.
    """
    local_dir = Path(source)
    config_marker = local_dir / "adapter_config.json"
    if local_dir.is_dir() and config_marker.exists():
        return local_dir

    # Imported lazily so purely local use does not need huggingface_hub.
    try:
        from huggingface_hub import snapshot_download
    except ImportError as exc:
        raise ImportError(
            "huggingface_hub is required to fetch the adapter from the Hub. "
            "Install it with: pip install huggingface_hub"
        ) from exc

    snapshot_path = snapshot_download(repo_id=source)
    return Path(snapshot_path)
def load_model(adapter_source: str, dtype: torch.dtype, device_map: str) -> tuple[Any, Any, Path]:
    """Load the processor and the adapter-wrapped model for ``adapter_source``.

    The adapter directory is resolved first (local path or Hub repo id). Its
    ``adapter_config.json`` supplies the base model id, which is used to load
    both the processor and the base model before PEFT attaches the adapter
    weights. The model is switched to eval mode.

    Returns:
        ``(processor, model, adapter_dir)``.
    """
    adapter_dir = resolve_adapter_dir(adapter_source)
    base_id = load_json(adapter_dir / "adapter_config.json")["base_model_name_or_path"]

    processor = AutoProcessor.from_pretrained(base_id, trust_remote_code=True)
    base_model = load_base_model(base_id, dtype=dtype, device_map=device_map)

    peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
    peft_model.eval()
    return processor, peft_model, adapter_dir
def build_messages(system_prompt: str, user_prompt: str, video_path: Path) -> list[dict[str, Any]]:
    """Build the two-turn chat: a system message and a user message with video + text.

    The user turn's content is a list holding the video entry (path as a
    string) followed by the text entry carrying ``user_prompt``.
    """
    video_part = {"type": "video", "video": str(video_path)}
    text_part = {"type": "text", "text": user_prompt}
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": [video_part, text_part]}
    return [system_turn, user_turn]
def prepare_inputs(
    processor: Any,
    messages: list[dict[str, Any]],
    device: torch.device,
    *,
    fps: float,
    max_pixels: int,
) -> dict[str, Any]:
    """Turn chat ``messages`` into processor inputs moved to ``device``.

    Steps, in order: render the chat template to text, import
    ``qwen_vl_utils``, fill in ``fps`` / ``max_pixels`` on every video entry
    that does not already set them (this mutates ``messages`` in place),
    extract the vision inputs, and run the processor.

    Raises:
        ImportError: if ``qwen_vl_utils`` is not installed.
    """
    chat_text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    try:
        from qwen_vl_utils import process_vision_info
    except ImportError as exc:
        raise ImportError(
            "qwen-vl-utils is required for local video inference. "
            "Install it with: pip install qwen-vl-utils[decord]"
        ) from exc

    for message in messages:
        parts = message.get("content")
        if not isinstance(parts, list):
            # Plain-string content (e.g. the system turn) has no video entries.
            continue
        for part in parts:
            if part.get("type") != "video":
                continue
            part.setdefault("fps", fps)
            part.setdefault("max_pixels", max_pixels)

    try:
        images, videos, extra_video_kwargs = process_vision_info(
            messages,
            return_video_kwargs=True,
        )
    except TypeError:
        # Fallback for a process_vision_info that rejects return_video_kwargs.
        images, videos = process_vision_info(messages)
        extra_video_kwargs = {}

    batch = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
        **extra_video_kwargs,
    )
    return batch.to(device)
def decode_generated(processor: Any, inputs: dict[str, Any], generated_ids: torch.Tensor) -> str:
    """Decode only the newly generated tokens of the first sequence.

    The prompt length is taken from ``inputs["input_ids"]``; that many leading
    positions are sliced off ``generated_ids`` before decoding. Special tokens
    are skipped and the result is stripped of surrounding whitespace.
    """
    prompt_length = inputs["input_ids"].shape[1]
    completion_ids = generated_ids[:, prompt_length:]
    decoded = processor.batch_decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    first_reply = decoded[0]
    return first_reply.strip()
def parse_score(text: str, key: str) -> int | None:
    """Extract the integer score (1-5) stored under ``key`` in a model reply.

    Two strategies are tried in order:

    1. Every ``{`` in ``text`` is treated as the possible start of a JSON
       object. The first object whose ``key`` entry is an integer in 1..5
       wins. Decoding from the brace handles nested objects, which a
       shortest-match ``{...}`` regex would cut off at the first ``}``.
    2. A loose ``key: N`` pattern (quotes around the key optional) for replies
       that are not valid JSON.

    Returns:
        The score, or ``None`` when no valid score is found.
    """
    decoder = json.JSONDecoder()
    for brace in re.finditer(r"\{", text):
        try:
            obj, _ = decoder.raw_decode(text, brace.start())
        except json.JSONDecodeError:
            continue
        value = obj.get(key)
        # bool is a subclass of int; without the exclusion ``true`` would be
        # returned as a score of True.
        if isinstance(value, int) and not isinstance(value, bool) and 1 <= value <= 5:
            return value

    # The lookahead rejects multi-digit numbers: without it an out-of-range
    # value such as 15 would be reported as 1.
    match = re.search(rf'"?{re.escape(key)}"?\s*:\s*([1-5])(?!\d)', text)
    if match:
        return int(match.group(1))
    return None
def dtype_from_name(name: str) -> torch.dtype:
    """Map a dtype name (``bfloat16``, ``float16`` or ``float32``) to the torch dtype.

    Raises:
        ValueError: for any other name.
    """
    known_dtypes = (
        ("bfloat16", torch.bfloat16),
        ("float16", torch.float16),
        ("float32", torch.float32),
    )
    for label, dtype in known_dtypes:
        if name == label:
            return dtype
    raise ValueError(f"unsupported dtype: {name}")
def main() -> None:
    """Command-line entry point: score one video/caption pair with the judge adapter.

    Parses the arguments, checks that the video file exists, loads the
    processor and adapter-wrapped model, renders the prompt from
    ``subq+human.yaml`` in the adapter directory, generates a reply and prints
    a JSON object with the score key, the parsed score (or null) and the raw
    reply text.

    Raises:
        FileNotFoundError: if ``--video`` does not point to a file.
    """
    cli = argparse.ArgumentParser(description="Infer with the judge adapter.")
    cli.add_argument(
        "--adapter-dir",
        default="NU-World-Model-Embodied-AI/phyjudge-9B",
        help=(
            "Local directory with adapter_config.json + adapter_model.safetensors "
            "+ subq+human.yaml, or a HF Hub repo id "
            "(default: NU-World-Model-Embodied-AI/phyjudge-9B)."
        ),
    )
    cli.add_argument("--video", required=True, type=Path)
    cli.add_argument("--caption", required=True)
    # Exactly one of --metric / --law selects what is being scored.
    target = cli.add_mutually_exclusive_group(required=True)
    target.add_argument("--metric", choices=["SA", "PTV", "persistence"])
    target.add_argument("--law", choices=sorted(PHYSICAL_CRITERIA))
    cli.add_argument("--criteria", help="Override physical-law criterion text.")
    cli.add_argument("--max-new-tokens", type=int, default=64)
    cli.add_argument("--temperature", type=float, default=0.0)
    cli.add_argument("--fps", type=float, default=2.0)
    cli.add_argument("--max-pixels", type=int, default=360 * 640)
    cli.add_argument("--dtype", choices=["bfloat16", "float16", "float32"], default="bfloat16")
    cli.add_argument("--device-map", default="auto")
    cli.add_argument("--print-prompt", action="store_true")
    opts = cli.parse_args()

    # Fail before the expensive model load when the video is missing.
    video_file = opts.video
    if not video_file.is_file():
        raise FileNotFoundError(video_file)

    torch_dtype = dtype_from_name(opts.dtype)
    processor, model, adapter_dir = load_model(
        opts.adapter_dir, dtype=torch_dtype, device_map=opts.device_map
    )

    prompt_cfg = load_yaml(adapter_dir / "subq+human.yaml")
    system_prompt, user_prompt, score_key = build_prompt(
        prompt_cfg,
        opts.caption,
        metric=opts.metric,
        law=opts.law,
        criteria=opts.criteria,
    )

    if opts.print_prompt:
        print("SYSTEM:", system_prompt, "\nUSER:", user_prompt, "", sep="\n")

    # Inputs go to the device of the model's first parameter.
    target_device = next(model.parameters()).device
    chat = build_messages(system_prompt, user_prompt, video_file)
    inputs = prepare_inputs(
        processor,
        chat,
        target_device,
        fps=opts.fps,
        max_pixels=opts.max_pixels,
    )

    # Sampling is enabled only for a positive temperature; otherwise the
    # temperature key is omitted entirely.
    sampling = opts.temperature > 0
    generation_kwargs: dict[str, Any] = {
        "max_new_tokens": opts.max_new_tokens,
        "do_sample": sampling,
    }
    if sampling:
        generation_kwargs["temperature"] = opts.temperature

    with torch.inference_mode():
        generated_ids = model.generate(**inputs, **generation_kwargs)

    raw_reply = decode_generated(processor, inputs, generated_ids)
    report = {
        "key": score_key,
        "score": parse_score(raw_reply, score_key),
        "raw": raw_reply,
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()