| { |
| "schema_version": "0.2", |
| "generated_by": { |
| "tool": "manual-by-agent", |
| "version": "0.2", |
| "agent": "claude-opus-4.6" |
| }, |
| "generated_at": "2026-04-28T12:06:09Z", |
| "stack": "multitask_dit_policy", |
| "input_contract": { |
| "images": [ |
| { |
| "key": "observation.images.front", |
| "aliases": [], |
| "raw_shape": [ |
| 3, |
| 480, |
| 640 |
| ], |
| "encoder_resize": [ |
| 224, |
| 224 |
| ], |
| "crop": null, |
| "color_order": "RGB", |
| "channel_layout": "CHW", |
| "dtype": "float32", |
| "value_range": [ |
| 0.0, |
| 1.0 |
| ], |
| "normalization": { |
| "type": "MEAN_STD", |
| "scope": "VISUAL", |
| "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)" |
| }, |
| "augmentations_in_training": [], |
| "physical_mounting": "front-mounted exterior camera", |
| "camera_serial": null, |
| "camera_usb_path": null, |
| "reference_frame_hash": null, |
| "reference_frame_path": null |
| }, |
| { |
| "key": "observation.images.wrist", |
| "aliases": [], |
| "raw_shape": [ |
| 3, |
| 480, |
| 640 |
| ], |
| "encoder_resize": [ |
| 224, |
| 224 |
| ], |
| "crop": null, |
| "color_order": "RGB", |
| "channel_layout": "CHW", |
| "dtype": "float32", |
| "value_range": [ |
| 0.0, |
| 1.0 |
| ], |
| "normalization": { |
| "type": "MEAN_STD", |
| "scope": "VISUAL", |
| "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)" |
| }, |
| "augmentations_in_training": [], |
| "physical_mounting": "wrist-mounted camera", |
| "camera_serial": null, |
| "camera_usb_path": null, |
| "reference_frame_hash": null, |
| "reference_frame_path": null |
| } |
| ], |
| "state": { |
| "total_dim": 16, |
| "sub_keys": [ |
| { |
| "key": "observation.state", |
| "dim": 7, |
| "convert_rotation": false, |
| "note": "raw 7D joint state (no rotation conversion)" |
| }, |
| { |
| "key": "observation.eef_6d_pose", |
| "dim": 9, |
| "convert_rotation": true, |
| "note": "3D translation + 6D rotation (rot6d) -- expanded from the 6D source pose (3D translation + 3D RPY) by rotation conversion" |
| } |
| ], |
| "normalization": { |
| "type": "RAMEN_MIN_MAX", |
| "source": "assets/ramen_stats.json", |
| "stats_dim": 16, |
| "stats_fingerprint": { |
| "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", |
| "per_dim_q02_at_t0": [ |
| -1.145761489868164, |
| 0.00858306884765625, |
| 0.02193450927734375, |
| -1.216334342956543, |
| -0.12111854553222656, |
| -0.6292438507080078, |
| 0.0008642768952995539, |
| 0.13416780531406403, |
| -0.3562192916870117, |
| 0.02314358949661255, |
| 0.0190497525036335, |
| -0.6936914324760437, |
| -0.9954489469528198, |
| 0.11262556910514832, |
| -0.05564267560839653, |
| -0.1800755113363266 |
| ], |
| "per_dim_q98_at_t0": [ |
| -0.06275272369384766, |
| 2.0300216674804688, |
| 1.7050046920776367, |
| 0.12226295471191406, |
| 0.3057527542114258, |
| 0.7852678298950195, |
| 0.06498069316148758, |
| 0.340210884809494, |
| -0.04877800866961479, |
| 0.2023451328277588, |
| 0.9633501768112183, |
| -0.021005695685744286, |
| -0.030398914590477943, |
| 0.9935086965560913, |
| 0.9875925779342651, |
| 0.14838503301143646 |
| ] |
| } |
| } |
| }, |
| "actions": { |
| "total_dim": 17, |
| "horizon": 32, |
| "sub_keys": [ |
| { |
| "key": "action", |
| "dim": 7, |
| "convert_rotation": false, |
| "note": "raw 7D joint action (deltas, RAMEN-normalized)" |
| }, |
| { |
| "key": "action.eef_pose", |
| "dim": 10, |
| "convert_rotation": true, |
| "note": "3D translation + 6D rotation + 1D gripper -- pose in 6D rotation form; the rot6d components (dims 10-15 of the 17D action vector) stay absolute and unnormalized" |
| } |
| ], |
| "norm_mask": [ |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| false, |
| false, |
| false, |
| false, |
| false, |
| false, |
| true |
| ], |
| "delta_dims": { |
| "delta_mask": [ |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| false, |
| false, |
| false, |
| false, |
| false, |
| false, |
| true |
| ], |
| "absolute_dims_reason": "dims 10-15 are 6D rotation (rot6d) passed through unchanged per dataset_schema.rot6d_slice == [10, 16]" |
| }, |
| "normalization": { |
| "type": "RAMEN_MIN_MAX", |
| "source": "assets/ramen_stats.json", |
| "stats_dim": 17, |
| "stats_layout": "(H=32, D=17) per-timestep", |
| "stats_fingerprint": { |
| "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", |
| "per_dim_q02_at_t0": [ |
| -0.03802967071533203, |
| -0.06523323059082031, |
| -0.059611137956380844, |
| -0.049973487854003906, |
| -0.0247955322265625, |
| -0.058365821838378906, |
| -0.019222989678382874, |
| -0.011416382156312466, |
| -0.012026323936879635, |
| -0.022589389234781265, |
| 0.006374520715326071, |
| -0.6977512240409851, |
| -0.9967345595359802, |
| 0.13310591876506805, |
| -0.06537134945392609, |
| -0.17569203674793243, |
| -0.003581584896892309 |
| ], |
| "per_dim_q98_at_t0": [ |
| 0.02593994140625, |
| 0.062180519104003906, |
| 0.051880836486816406, |
| 0.043107032775878906, |
| 0.026703834533691406, |
| 0.056458473205566406, |
| 0.017502864822745323, |
| 0.009446386247873306, |
| 0.012261614203453064, |
| 0.020949140191078186, |
| 0.8820995092391968, |
| -0.014275601133704185, |
| -0.05388924479484558, |
| 0.9945363998413086, |
| 0.9843086004257202, |
| 0.14365684986114502, |
| 0.06591955572366714 |
| ] |
| } |
| } |
| }, |
| "language": { |
| "tokenizer_class": "CLIPTokenizer", |
| "tokenizer_version": "openai/clip-vit-base-patch16", |
| "max_sequence_length": 77, |
| "default_prompt": "build a block tower", |
| "training_prompts": null |
| }, |
| "temporal": { |
| "n_obs_steps": 2, |
| "observation_delta_indices": [ |
| -1, |
| 0 |
| ], |
| "delta_timestamps": null, |
| "control_rate_hz": null |
| }, |
| "training_datasets": [ |
| { |
| "repo": "pravsels/dit_block_tower_norm_fix", |
| "commit": null, |
| "version": null, |
| "num_episodes": null, |
| "total_frames": null, |
| "episode_filter": null, |
| "sampling_weight": null, |
| "key_rename_map": {}, |
| "delta_timestamps_at_training": null, |
| "contributes_to_norm_stats": true |
| } |
| ] |
| }, |
| "model_identity": { |
| "class_name": "MultiTaskDiTPolicy", |
| "class_module": "multitask_dit_policy.model.model", |
| "config_architectures": [], |
| "resolved_via": "direct_import", |
| "resolved_class_name": "MultiTaskDiTPolicy", |
| "library_versions": { |
| "torch": "2.10.0", |
| "transformers": "5.4.0", |
| "timm": "1.0.26", |
| "safetensors": "0.7.0", |
| "draccus": "0.10.0", |
| "lerobot": "0.5.0", |
| "multitask_dit_policy": "0.1.0" |
| }, |
| "runtime_constraints": { |
| "required_versions": { |
| "transformers": "==5.4.0", |
| "torch": ">=2.10.0", |
| "timm": ">=1.0.26" |
| }, |
| "required_python": ">=3.12,<3.13", |
| "known_incompatible": [ |
| "transformers>=5.5.0 (CLIP key layout change breaks weight loading)" |
| ] |
| }, |
| "python_version": "3.12.3", |
| "cuda_version": "12.8" |
| }, |
| "model_internals": { |
| "module_hierarchy": [ |
| { |
| "name": "", |
| "class": "MultiTaskDiTPolicy", |
| "children": [ |
| { |
| "name": "observation_encoder", |
| "class": "ObservationEncoder" |
| }, |
| { |
| "name": "action_head", |
| "class": "DiffusionActionHead" |
| } |
| ] |
| } |
| ], |
| "parameters": { |
| "summary": { |
| "total_params": 335072273, |
| "trainable_params": 335072273, |
| "frozen_params": 0, |
| "total_bytes": 1340286788, |
| "dtype_breakdown": { |
| "float32": 335072273 |
| } |
| } |
| }, |
| "buffers": [ |
| { |
| "name": "observation_encoder.text_encoder.text_encoder.text_model.embeddings.position_ids", |
| "shape": [ |
| 1, |
| 77 |
| ], |
| "dtype": "int64" |
| } |
| ], |
| "state_dict": { |
| "expected_keys_count": 569, |
| "found_keys_count": 569, |
| "missing_keys": [], |
| "unexpected_keys": [] |
| }, |
| "pretrained_provenance": [ |
| { |
| "submodule": "observation_encoder.vision", |
| "source": "timm", |
| "timm_string": "vit_base_patch16_clip_224.openai", |
| "hf_revision": "timm/vit_base_patch16_clip_224.openai", |
| "frozen_in_training": false, |
| "lr_multiplier": 0.1 |
| }, |
| { |
| "submodule": "observation_encoder.text", |
| "source": "huggingface", |
| "timm_string": null, |
| "hf_revision": "57c216476eefef5ab752ec549e440a49ae4ae5f3", |
| "frozen_in_training": false, |
| "lr_multiplier": null |
| } |
| ], |
| "quantization": { |
| "scheme": "none", |
| "per_tensor_scales": null |
| }, |
| "forward_graph": { |
| "forward_signature": null, |
| "expected_input_keys": [ |
| "observation.images.front", |
| "observation.images.wrist", |
| "observation.state", |
| "task" |
| ], |
| "sample_input_shapes": { |
| "observation.images.front": [ |
| 1, |
| 2, |
| 3, |
| 480, |
| 640 |
| ], |
| "observation.images.wrist": [ |
| 1, |
| 2, |
| 3, |
| 480, |
| 640 |
| ], |
| "observation.state": [ |
| 1, |
| 2, |
| 16 |
| ] |
| }, |
| "sample_output_shapes": { |
| "action": [ |
| 1, |
| 32, |
| 17 |
| ] |
| }, |
| "flops_estimate": null, |
| "peak_memory_inference_b1_bytes": null |
| }, |
| "numerical_health": { |
| "determinism": { |
| "passed": true, |
| "max_abs_diff": 0.0, |
| "loss_value": 2.032438039779663 |
| }, |
| "no_nan_inf": { |
| "passed": true, |
| "loss_value": 2.032438039779663, |
| "action_n_nan": 0, |
| "action_n_inf": 0 |
| }, |
| "dropout_in_eval": { |
| "passed": true, |
| "total_dropout_modules": 0, |
| "still_training": [] |
| }, |
| "bn_running_stats_present": { |
| "passed": true, |
| "total_bn_modules": 0, |
| "with_running_stats": 0 |
| } |
| } |
| }, |
| "output_spec": { |
| "actions": { |
| "layout": "mirrors input_contract.actions", |
| "sub_keys": "see input_contract.actions.sub_keys", |
| "horizon": 32, |
| "control_rate_hz": null, |
| "action_latency_budget_ms": null |
| }, |
| "auxiliary_outputs": { |
| "reward_head": null, |
| "value_head": null, |
| "latents_exposed": false, |
| "attention_maps_exposed": false |
| }, |
| "inference_parameters": { |
| "type": "diffusion", |
| "num_inference_steps": 20, |
| "scheduler": "DDIM", |
| "prediction_type": "epsilon", |
| "clip_sample": true, |
| "clip_sample_range": 1.0, |
| "chunk_aggregation": "first_n_action_steps", |
| "chunks_executed_per_inference": 32, |
| "extra": { |
| "num_train_timesteps": 100, |
| "beta_schedule": "squaredcos_cap_v2", |
| "ramen_clip_value": 1.5 |
| } |
| }, |
| "post_processing": { |
| "unnormalize": true, |
| "delta_to_absolute": { |
| "applies_to_dims": "0-9, 16 (RAMEN delta dims)", |
| "method": "add to current observed state" |
| }, |
| "action_smoothing": null, |
| "action_clamping": null |
| }, |
| "smoke_results": { |
| "calibration_batch_source": "synthetic (torch.rand for images, torch.randn for state, fixed prompt 'build a block tower')", |
| "calibration_batch_size": 2, |
| "determinism": { |
| "status": "pass", |
| "max_abs_diff": 0.0, |
| "method": "two forward passes with torch.manual_seed(0); compare loss tensors via torch.allclose-equivalent diff", |
| "details": {} |
| }, |
| "nan_inf": { |
| "status": "pass", |
| "n_nan": 0, |
| "n_inf": 0, |
| "samples_checked": 1088, |
| "details": {} |
| }, |
| "liveness": { |
| "status": "pass", |
| "std": 0.5793782472610474, |
| "mean": 0.0665111392736435, |
| "criterion": "action std > 1e-5", |
| "details": {} |
| }, |
| "distribution": { |
| "status": null, |
| "ratio_in_acceptable_range": true, |
| "min": -0.9089961647987366, |
| "max": 1.459722876548767, |
| "mean": 0.0665111392736435, |
| "std": 0.5793782472610474, |
| "per_dim_mean": [ |
| 0.48858821392059326, |
| -0.6914477944374084, |
| -0.3306178152561188, |
| 0.29667824506759644, |
| 0.6836938261985779, |
| 0.3917674720287323, |
| -0.43627968430519104, |
| -0.16753537952899933, |
| 0.06529225409030914, |
| 0.3296715021133423, |
| 0.62017822265625, |
| 0.1883048117160797, |
| -0.624245285987854, |
| 0.04247837886214256, |
| 0.8990960717201233, |
| 0.14396880567073822, |
| -0.7689023613929749 |
| ], |
| "per_dim_std": [ |
| 0.44592636823654175, |
| 0.18982458114624023, |
| 0.1246829554438591, |
| 0.21343328058719635, |
| 0.6306062936782837, |
| 0.15469537675380707, |
| 0.17833946645259857, |
| 0.29920586943626404, |
| 0.21930228173732758, |
| 0.12209426611661911, |
| 0.2748163640499115, |
| 0.5151214599609375, |
| 0.1576608568429947, |
| 0.4949493110179901, |
| 0.14426289498806, |
| 0.38943856954574585, |
| 0.14056310057640076 |
| ], |
| "details": {} |
| }, |
| "range_check": { |
| "status": null, |
| "in_expected_range": true, |
| "expected_range": [ |
| -1.5, |
| 1.5 |
| ], |
| "actual_min": -0.9089961647987366, |
| "actual_max": 1.459722876548767, |
| "rationale": "ramen_clip_value=1.5; outputs are RAMEN-normalized actions before unnormalization", |
| "details": {} |
| } |
| } |
| }, |
| "weight_integrity": { |
| "weight_files": [ |
| { |
| "path": "checkpoints/29000/params/model.safetensors", |
| "sha256": "39a8ac32231cedcd6108c65464db51401babf811a489701e01c1772a448d2530", |
| "size_bytes": 1340286788 |
| }, |
| { |
| "path": "checkpoints/29000/params/config.json", |
| "sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1", |
| "size_bytes": 3037 |
| }, |
| { |
| "path": "assets/ramen_stats.json", |
| "sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500", |
| "size_bytes": 34327 |
| } |
| ], |
| "manifest_hash": null |
| }, |
| "provenance": { |
| "run_log_path": "https://wandb.ai/pravsels/dit_block_tower_norm_fix/runs/ksuxe451", |
| "training_repo": "https://github.com/pravsels/multitask_dit_policy", |
| "training_repo_commit": "af0a43a512841aa1f4d6bb2f93755e5358dca8cb", |
| "config_snapshot_path": "checkpoints/29000/params/config.json", |
| "merged_config_sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1", |
| "parent_checkpoint": null, |
| "parent_description": null |
| }, |
| "transform_pipeline": [ |
| { |
| "order": 1, |
| "name": "resize_images", |
| "applies_to": "all_images", |
| "operation": "resize", |
| "direction": "input", |
| "parameters": { |
| "target_size": [ |
| 224, |
| 224 |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "compare config.observation_encoder.vision.resize_shape" |
| }, |
| { |
| "order": 2, |
| "name": "imagenet_normalize_images", |
| "applies_to": "all_images", |
| "operation": "imagenet_normalize", |
| "direction": "input", |
| "parameters": { |
| "mean": [ |
| 0.485, |
| 0.456, |
| 0.406 |
| ], |
| "std": [ |
| 0.229, |
| 0.224, |
| 0.225 |
| ] |
| }, |
| "check_type": "static", |
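| "check_description": "verify mean/std constants match those used in training (listed values are ImageNet constants; OpenAI CLIP weights are normally paired with CLIP-specific mean/std -- confirm against training config)" |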
| }, |
| { |
| "order": 3, |
| "name": "rot6d_expand_state", |
| "applies_to": "state", |
| "operation": "rotation_conversion", |
| "direction": "input", |
| "parameters": { |
| "source_dim": 13, |
| "target_dim": 16, |
| "rot6d_slice": [ |
| 10, |
| 16 |
| ], |
| "source_repr": "rpy_6d", |
| "target_repr": "rot6d_9d" |
| }, |
| "check_type": "static", |
| "check_description": "verify norm_mask + rot6d_slice match config.dataset_schema" |
| }, |
| { |
| "order": 4, |
| "name": "ramen_normalize_state", |
| "applies_to": "state", |
| "operation": "ramen_normalize", |
| "direction": "input", |
| "parameters": { |
| "stats_file": "assets/ramen_stats.json", |
| "clip_value": 1.5, |
| "exempt_dims": [ |
| 10, |
| 11, |
| 12, |
| 13, |
| 14, |
| 15 |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify stats file hash + q02/q98 fingerprint at t0" |
| }, |
| { |
| "order": 5, |
| "name": "compute_delta_actions", |
| "applies_to": "action", |
| "operation": "delta", |
| "direction": "input", |
| "parameters": { |
| "method": "action - state on delta_mask'd dims", |
| "delta_mask": [ |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| false, |
| false, |
| false, |
| false, |
| false, |
| false, |
| true |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify delta_dims mask matches norm_mask; absolute dims (10-15) are rot6d" |
| }, |
| { |
| "order": 6, |
| "name": "ramen_normalize_actions", |
| "applies_to": "action", |
| "operation": "ramen_normalize", |
| "direction": "input", |
| "parameters": { |
| "stats_file": "assets/ramen_stats.json", |
| "stats_layout": "(H=32, D=17) per-timestep", |
| "clip_value": 1.5, |
| "exempt_dims": [ |
| 10, |
| 11, |
| 12, |
| 13, |
| 14, |
| 15 |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify stats file hash + per-timestep q02/q98 fingerprint" |
| }, |
| { |
| "order": 7, |
| "name": "stack_cameras", |
| "applies_to": "all_images", |
| "operation": "stack_cameras", |
| "direction": "input", |
| "parameters": { |
| "key_order": [ |
| "observation.images.front", |
| "observation.images.wrist" |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify camera key order matches config.input_features ordering" |
| }, |
| { |
| "order": 8, |
| "name": "temporal_stack", |
| "applies_to": "all", |
| "operation": "temporal_stack", |
| "direction": "input", |
| "parameters": { |
| "n_obs_steps": 2, |
| "observation_delta_indices": [ |
| -1, |
| 0 |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify n_obs_steps and delta_indices match config" |
| }, |
| { |
| "order": 9, |
| "name": "clip_tokenize_text", |
| "applies_to": "text", |
| "operation": "tokenize", |
| "direction": "input", |
| "parameters": { |
| "tokenizer": "openai/clip-vit-base-patch16", |
| "max_length": 77 |
| }, |
| "check_type": "static", |
| "check_description": "verify tokenizer model and max_length" |
| }, |
| { |
| "order": 10, |
| "name": "diffusion_forward", |
| "applies_to": "all", |
| "operation": "diffusion_inference", |
| "direction": "input", |
| "parameters": { |
| "num_inference_steps": 20, |
| "scheduler": "DDIM", |
| "prediction_type": "epsilon", |
| "clip_sample_range": 1.0 |
| }, |
| "check_type": "static", |
| "check_description": "verify inference parameters match config.objective" |
| }, |
| { |
| "order": 11, |
| "name": "ramen_unnormalize_actions", |
| "applies_to": "action", |
| "operation": "ramen_unnormalize", |
| "direction": "output", |
| "parameters": { |
| "stats_file": "assets/ramen_stats.json", |
| "inverse_of": "step 6" |
| }, |
| "check_type": "static", |
| "check_description": "verify uses same stats and mask as step 6" |
| }, |
| { |
| "order": 12, |
| "name": "delta_to_absolute", |
| "applies_to": "action", |
| "operation": "delta_to_absolute", |
| "direction": "output", |
| "parameters": { |
| "method": "add current observed state on delta_mask'd dims", |
| "delta_mask": [ |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| true, |
| false, |
| false, |
| false, |
| false, |
| false, |
| false, |
| true |
| ] |
| }, |
| "check_type": "static", |
| "check_description": "verify dims match step 5 delta_mask" |
| } |
| ], |
| "reference_test_vector": null, |
| "norm_round_trip_results": [], |
| "known_issues": [ |
| { |
| "id": "transformers_drift", |
| "severity": "critical", |
| "description": "transformers>=5.5.0 changes the CLIP (ViT-B/16) text-encoder state-dict key layout (text_model.encoder vs text_encoder), breaking weight loading", |
| "workaround": "Pin transformers==5.4.0 in runtime environment", |
| "check_type": "static" |
| }, |
| { |
| "id": "ros_pythonpath_leak", |
| "severity": "warning", |
| "description": "/opt/ros paths on PYTHONPATH break Python 3.12 imports (importlib compat)", |
| "workaround": "Unset or filter PYTHONPATH before launching: export PYTHONPATH=$(echo $PYTHONPATH | tr ':' '\\n' | grep -v /opt/ros | paste -sd ':')", |
| "check_type": "static" |
| }, |
| { |
| "id": "ramen_stats_format", |
| "severity": "warning", |
| "description": "Checkpoint uses ramen_stats.json (RAMEN format), not dataset_stats.json (LeRobot format). Loading code must detect format and route accordingly.", |
| "workaround": "Use stats_format='ramen' when constructing adapter; load_ramen_stats() handles the format.", |
| "check_type": "static" |
| } |
| ] |
| } |