{ "schema_version": "0.2", "generated_by": { "tool": "manual-by-agent", "version": "0.2", "agent": "claude-opus-4.6" }, "generated_at": "2026-04-28T12:06:09Z", "stack": "multitask_dit_policy", "input_contract": { "images": [ { "key": "observation.images.front", "aliases": [], "raw_shape": [ 3, 480, 640 ], "encoder_resize": [ 224, 224 ], "crop": null, "color_order": "RGB", "channel_layout": "CHW", "dtype": "float32", "value_range": [ 0.0, 1.0 ], "normalization": { "type": "MEAN_STD", "scope": "VISUAL", "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)" }, "augmentations_in_training": [], "physical_mounting": "front-mounted exterior camera", "camera_serial": null, "camera_usb_path": null, "reference_frame_hash": null, "reference_frame_path": null }, { "key": "observation.images.wrist", "aliases": [], "raw_shape": [ 3, 480, 640 ], "encoder_resize": [ 224, 224 ], "crop": null, "color_order": "RGB", "channel_layout": "CHW", "dtype": "float32", "value_range": [ 0.0, 1.0 ], "normalization": { "type": "MEAN_STD", "scope": "VISUAL", "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)" }, "augmentations_in_training": [], "physical_mounting": "wrist-mounted camera", "camera_serial": null, "camera_usb_path": null, "reference_frame_hash": null, "reference_frame_path": null } ], "state": { "total_dim": 16, "sub_keys": [ { "key": "observation.state", "dim": 7, "convert_rotation": false, "note": "raw 7D joint state (no rotation conversion)" }, { "key": "observation.eef_6d_pose", "dim": 9, "convert_rotation": true, "note": "3D translation + 6D rotation (rot6d) -- expanded from 6D source by rotation conversion" } ], "normalization": { "type": "RAMEN_MIN_MAX", "source": "assets/ramen_stats.json", "stats_dim": 16, "stats_fingerprint": { "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", "per_dim_q02_at_t0": [ -1.145761489868164, 0.00858306884765625, 0.02193450927734375, -1.216334342956543, -0.12111854553222656, 
-0.6292438507080078, 0.0008642768952995539, 0.13416780531406403, -0.3562192916870117, 0.02314358949661255, 0.0190497525036335, -0.6936914324760437, -0.9954489469528198, 0.11262556910514832, -0.05564267560839653, -0.1800755113363266 ], "per_dim_q98_at_t0": [ -0.06275272369384766, 2.0300216674804688, 1.7050046920776367, 0.12226295471191406, 0.3057527542114258, 0.7852678298950195, 0.06498069316148758, 0.340210884809494, -0.04877800866961479, 0.2023451328277588, 0.9633501768112183, -0.021005695685744286, -0.030398914590477943, 0.9935086965560913, 0.9875925779342651, 0.14838503301143646 ] } } }, "actions": { "total_dim": 17, "horizon": 32, "sub_keys": [ { "key": "action", "dim": 7, "convert_rotation": false, "note": "raw 7D joint action (deltas, RAMEN-normalized)" }, { "key": "action.eef_pose", "dim": 10, "convert_rotation": true, "note": "3D translation + 6D rotation + 1D gripper -- absolute pose in 6D rotation form, dims 10-15 unnormalized" } ], "norm_mask": [ true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true ], "delta_dims": { "delta_mask": [ true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true ], "absolute_dims_reason": "dims 10-15 are 6D rotation (rot6d) passed through unchanged per dataset_schema.rot6d_slice == [10, 16]" }, "normalization": { "type": "RAMEN_MIN_MAX", "source": "assets/ramen_stats.json", "stats_dim": 17, "stats_layout": "(H=32, D=17) per-timestep", "stats_fingerprint": { "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", "per_dim_q02_at_t0": [ -0.03802967071533203, -0.06523323059082031, -0.059611137956380844, -0.049973487854003906, -0.0247955322265625, -0.058365821838378906, -0.019222989678382874, -0.011416382156312466, -0.012026323936879635, -0.022589389234781265, 0.006374520715326071, -0.6977512240409851, -0.9967345595359802, 0.13310591876506805, -0.06537134945392609, -0.17569203674793243, 
-0.003581584896892309 ], "per_dim_q98_at_t0": [ 0.02593994140625, 0.062180519104003906, 0.051880836486816406, 0.043107032775878906, 0.026703834533691406, 0.056458473205566406, 0.017502864822745323, 0.009446386247873306, 0.012261614203453064, 0.020949140191078186, 0.8820995092391968, -0.014275601133704185, -0.05388924479484558, 0.9945363998413086, 0.9843086004257202, 0.14365684986114502, 0.06591955572366714 ] } } }, "language": { "tokenizer_class": "CLIPTokenizer", "tokenizer_version": "openai/clip-vit-base-patch16", "max_sequence_length": 77, "default_prompt": "build a block tower", "training_prompts": null }, "temporal": { "n_obs_steps": 2, "observation_delta_indices": [ -1, 0 ], "delta_timestamps": null, "control_rate_hz": null }, "training_datasets": [ { "repo": "pravsels/dit_block_tower_norm_fix", "commit": null, "version": null, "num_episodes": null, "total_frames": null, "episode_filter": null, "sampling_weight": null, "key_rename_map": {}, "delta_timestamps_at_training": null, "contributes_to_norm_stats": true } ] }, "model_identity": { "class_name": "MultiTaskDiTPolicy", "class_module": "multitask_dit_policy.model.model", "config_architectures": [], "resolved_via": "direct_import", "resolved_class_name": "MultiTaskDiTPolicy", "library_versions": { "torch": "2.10.0", "transformers": "5.4.0", "timm": "1.0.26", "safetensors": "0.7.0", "draccus": "0.10.0", "lerobot": "0.5.0", "multitask_dit_policy": "0.1.0" }, "runtime_constraints": { "required_versions": { "transformers": "==5.4.0", "torch": ">=2.10.0", "timm": ">=1.0.26" }, "required_python": ">=3.12,<3.13", "known_incompatible": [ "transformers>=5.5.0 (CLIP key layout change breaks weight loading)" ] }, "python_version": "3.12.3", "cuda_version": "12.8" }, "model_internals": { "module_hierarchy": [ { "name": "", "class": "MultiTaskDiTPolicy", "children": [ { "name": "observation_encoder", "class": "ObservationEncoder" }, { "name": "action_head", "class": "DiffusionActionHead" } ] } ], "parameters": { 
"summary": { "total_params": 335072273, "trainable_params": 335072273, "frozen_params": 0, "total_bytes": 1340286788, "dtype_breakdown": { "float32": 335072273 } } }, "buffers": [ { "name": "observation_encoder.text_encoder.text_encoder.text_model.embeddings.position_ids", "shape": [ 1, 77 ], "dtype": "int64" } ], "state_dict": { "expected_keys_count": 569, "found_keys_count": 569, "missing_keys": [], "unexpected_keys": [] }, "pretrained_provenance": [ { "submodule": "observation_encoder.vision", "source": "timm", "timm_string": "vit_base_patch16_clip_224.openai", "hf_revision": "timm/vit_base_patch16_clip_224.openai", "frozen_in_training": false, "lr_multiplier": 0.1 }, { "submodule": "observation_encoder.text", "source": "huggingface", "timm_string": null, "hf_revision": "57c216476eefef5ab752ec549e440a49ae4ae5f3", "frozen_in_training": false, "lr_multiplier": null } ], "quantization": { "scheme": "none", "per_tensor_scales": null }, "forward_graph": { "forward_signature": null, "expected_input_keys": [ "observation.images.front", "observation.images.wrist", "observation.state", "task" ], "sample_input_shapes": { "observation.images.front": [ 1, 2, 3, 480, 640 ], "observation.images.wrist": [ 1, 2, 3, 480, 640 ], "observation.state": [ 1, 2, 16 ] }, "sample_output_shapes": { "action": [ 1, 32, 17 ] }, "flops_estimate": null, "peak_memory_inference_b1_bytes": null }, "numerical_health": { "determinism": { "passed": true, "max_abs_diff": 0.0, "loss_value": 2.032438039779663 }, "no_nan_inf": { "passed": true, "loss_value": 2.032438039779663, "action_n_nan": 0, "action_n_inf": 0 }, "dropout_in_eval": { "passed": true, "total_dropout_modules": 0, "still_training": [] }, "bn_running_stats_present": { "passed": true, "total_bn_modules": 0, "with_running_stats": 0 } } }, "output_spec": { "actions": { "layout": "mirrors input_contract.actions", "sub_keys": "see input_contract.actions.sub_keys", "horizon": 32, "control_rate_hz": null, "action_latency_budget_ms": null }, 
"auxiliary_outputs": { "reward_head": null, "value_head": null, "latents_exposed": false, "attention_maps_exposed": false }, "inference_parameters": { "type": "diffusion", "num_inference_steps": 20, "scheduler": "DDIM", "prediction_type": "epsilon", "clip_sample": true, "clip_sample_range": 1.0, "chunk_aggregation": "first_n_action_steps", "chunks_executed_per_inference": 32, "extra": { "num_train_timesteps": 100, "beta_schedule": "squaredcos_cap_v2", "ramen_clip_value": 1.5 } }, "post_processing": { "unnormalize": true, "delta_to_absolute": { "applies_to_dims": "0-9, 16 (RAMEN delta dims)", "method": "add to current observed state" }, "action_smoothing": null, "action_clamping": null }, "smoke_results": { "calibration_batch_source": "synthetic (torch.rand for images, torch.randn for state, fixed prompt 'build a block tower')", "calibration_batch_size": 2, "determinism": { "status": "pass", "max_abs_diff": 0.0, "method": "two forward passes with torch.manual_seed(0); compare loss tensors via torch.allclose-equivalent diff", "details": {} }, "nan_inf": { "status": "pass", "n_nan": 0, "n_inf": 0, "samples_checked": 1088, "details": {} }, "liveness": { "status": "pass", "std": 0.5793782472610474, "mean": 0.0665111392736435, "criterion": "action std > 1e-5", "details": {} }, "distribution": { "status": null, "ratio_in_acceptable_range": true, "min": -0.9089961647987366, "max": 1.459722876548767, "mean": 0.0665111392736435, "std": 0.5793782472610474, "per_dim_mean": [ 0.48858821392059326, -0.6914477944374084, -0.3306178152561188, 0.29667824506759644, 0.6836938261985779, 0.3917674720287323, -0.43627968430519104, -0.16753537952899933, 0.06529225409030914, 0.3296715021133423, 0.62017822265625, 0.1883048117160797, -0.624245285987854, 0.04247837886214256, 0.8990960717201233, 0.14396880567073822, -0.7689023613929749 ], "per_dim_std": [ 0.44592636823654175, 0.18982458114624023, 0.1246829554438591, 0.21343328058719635, 0.6306062936782837, 0.15469537675380707, 
0.17833946645259857, 0.29920586943626404, 0.21930228173732758, 0.12209426611661911, 0.2748163640499115, 0.5151214599609375, 0.1576608568429947, 0.4949493110179901, 0.14426289498806, 0.38943856954574585, 0.14056310057640076 ], "details": {} }, "range_check": { "status": null, "in_expected_range": true, "expected_range": [ -1.5, 1.5 ], "actual_min": -0.9089961647987366, "actual_max": 1.459722876548767, "rationale": "ramen_clip_value=1.5; outputs are RAMEN-normalized actions before unnormalization", "details": {} } } }, "weight_integrity": { "weight_files": [ { "path": "checkpoints/29000/params/model.safetensors", "sha256": "39a8ac32231cedcd6108c65464db51401babf811a489701e01c1772a448d2530", "size_bytes": 1340286788 }, { "path": "checkpoints/29000/params/config.json", "sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1", "size_bytes": 3037 }, { "path": "assets/ramen_stats.json", "sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", "size_bytes": 34327 } ], "manifest_hash": null }, "provenance": { "run_log_path": "https://wandb.ai/pravsels/dit_block_tower_norm_fix/runs/ksuxe451", "training_repo": "https://github.com/pravsels/multitask_dit_policy", "training_repo_commit": "af0a43a512841aa1f4d6bb2f93755e5358dca8cb", "config_snapshot_path": "checkpoints/29000/params/config.json", "merged_config_sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1", "parent_checkpoint": null, "parent_description": null }, "transform_pipeline": [ { "order": 1, "name": "resize_images", "applies_to": "all_images", "operation": "resize", "direction": "input", "parameters": { "target_size": [ 224, 224 ] }, "check_type": "static", "check_description": "compare config.observation_encoder.vision.resize_shape" }, { "order": 2, "name": "imagenet_normalize_images", "applies_to": "all_images", "operation": "imagenet_normalize", "direction": "input", "parameters": { "mean": [ 0.485, 0.456, 0.406 ], "std": [ 0.229, 0.224, 0.225 ] }, 
"check_type": "static", "check_description": "verify mean/std constants match the image normalization applied in training; NOTE(review): the recorded values are the ImageNet constants, not the OpenAI CLIP mean/std that the .openai CLIP ViT-B/16 pretrained config uses -- confirm against the training code" }, { "order": 3, "name": "rot6d_expand_state", "applies_to": "state", "operation": "rotation_conversion", "direction": "input", "parameters": { "source_dim": 13, "target_dim": 16, "rot6d_slice": [ 10, 16 ], "source_repr": "rpy_6d", "target_repr": "rot6d_9d" }, "check_type": "static", "check_description": "verify norm_mask + rot6d_slice match config.dataset_schema" }, { "order": 4, "name": "ramen_normalize_state", "applies_to": "state", "operation": "ramen_normalize", "direction": "input", "parameters": { "stats_file": "assets/ramen_stats.json", "clip_value": 1.5, "exempt_dims": [ 10, 11, 12, 13, 14, 15 ] }, "check_type": "static", "check_description": "verify stats file hash + q02/q98 fingerprint at t0" }, { "order": 5, "name": "compute_delta_actions", "applies_to": "action", "operation": "delta", "direction": "input", "parameters": { "method": "action - state on norm_mask'd dims", "delta_mask": [ true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true ] }, "check_type": "static", "check_description": "verify delta_dims mask matches norm_mask; absolute dims (10-15) are rot6d" }, { "order": 6, "name": "ramen_normalize_actions", "applies_to": "action", "operation": "ramen_normalize", "direction": "input", "parameters": { "stats_file": "assets/ramen_stats.json", "stats_layout": "(H=32, D=17) per-timestep", "clip_value": 1.5, "exempt_dims": [ 10, 11, 12, 13, 14, 15 ] }, "check_type": "static", "check_description": "verify stats file hash + per-timestep q02/q98 fingerprint" }, { "order": 7, "name": "stack_cameras", "applies_to": "all_images", "operation": "stack_cameras", "direction": "input", "parameters": { "key_order": [ "observation.images.front", "observation.images.wrist" ] }, "check_type": "static", "check_description": "verify camera key order matches config.input_features ordering" }, { "order": 8, 
"name": "temporal_stack", "applies_to": "all", "operation": "temporal_stack", "direction": "input", "parameters": { "n_obs_steps": 2, "observation_delta_indices": [ -1, 0 ] }, "check_type": "static", "check_description": "verify n_obs_steps and delta_indices match config" }, { "order": 9, "name": "clip_tokenize_text", "applies_to": "text", "operation": "tokenize", "direction": "input", "parameters": { "tokenizer": "openai/clip-vit-base-patch16", "max_length": 77 }, "check_type": "static", "check_description": "verify tokenizer model and max_length" }, { "order": 10, "name": "diffusion_forward", "applies_to": "all", "operation": "diffusion_inference", "direction": "input", "parameters": { "num_inference_steps": 20, "scheduler": "DDIM", "prediction_type": "epsilon", "clip_sample_range": 1.0 }, "check_type": "static", "check_description": "verify inference parameters match config.objective" }, { "order": 11, "name": "ramen_unnormalize_actions", "applies_to": "action", "operation": "ramen_unnormalize", "direction": "output", "parameters": { "stats_file": "assets/ramen_stats.json", "inverse_of": "step 6" }, "check_type": "static", "check_description": "verify uses same stats and mask as step 6" }, { "order": 12, "name": "delta_to_absolute", "applies_to": "action", "operation": "delta_to_absolute", "direction": "output", "parameters": { "method": "add current observed state on delta_mask'd dims", "delta_mask": [ true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, true ] }, "check_type": "static", "check_description": "verify dims match step 5 delta_mask" } ], "reference_test_vector": null, "norm_round_trip_results": [], "known_issues": [ { "id": "transformers_drift", "severity": "critical", "description": "transformers>=5.5.0 changes CLIP ViT key layout (text_model.encoder vs text_encoder), breaking weight loading", "workaround": "Pin transformers==5.4.0 in runtime environment", "check_type": "static" }, { "id": 
"ros_pythonpath_leak", "severity": "warning", "description": "/opt/ros paths on PYTHONPATH break Python 3.12 imports (importlib compat)", "workaround": "Unset or filter PYTHONPATH before launching: export PYTHONPATH=$(echo $PYTHONPATH | tr ':' '\\n' | grep -v /opt/ros | paste -sd ':')", "check_type": "static" }, { "id": "ramen_stats_format", "severity": "warning", "description": "Checkpoint uses ramen_stats.json (RAMEN format), not dataset_stats.json (LeRobot format). Loading code must detect format and route accordingly.", "workaround": "Use stats_format='ramen' when constructing adapter; load_ramen_stats() handles the format.", "check_type": "static" } ] }