{
"schema_version": "0.2",
"generated_by": {
"tool": "manual-by-agent",
"version": "0.2",
"agent": "claude-opus-4.6"
},
"generated_at": "2026-04-28T12:06:09Z",
"stack": "multitask_dit_policy",
"input_contract": {
"images": [
{
"key": "observation.images.front",
"aliases": [],
"raw_shape": [
3,
480,
640
],
"encoder_resize": [
224,
224
],
"crop": null,
"color_order": "RGB",
"channel_layout": "CHW",
"dtype": "float32",
"value_range": [
0.0,
1.0
],
"normalization": {
"type": "MEAN_STD",
"scope": "VISUAL",
"applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
},
"augmentations_in_training": [],
"physical_mounting": "front-mounted exterior camera",
"camera_serial": null,
"camera_usb_path": null,
"reference_frame_hash": null,
"reference_frame_path": null
},
{
"key": "observation.images.wrist",
"aliases": [],
"raw_shape": [
3,
480,
640
],
"encoder_resize": [
224,
224
],
"crop": null,
"color_order": "RGB",
"channel_layout": "CHW",
"dtype": "float32",
"value_range": [
0.0,
1.0
],
"normalization": {
"type": "MEAN_STD",
"scope": "VISUAL",
"applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
},
"augmentations_in_training": [],
"physical_mounting": "wrist-mounted camera",
"camera_serial": null,
"camera_usb_path": null,
"reference_frame_hash": null,
"reference_frame_path": null
}
],
"state": {
"total_dim": 16,
"sub_keys": [
{
"key": "observation.state",
"dim": 7,
"convert_rotation": false,
"note": "raw 7D joint state (no rotation conversion)"
},
{
"key": "observation.eef_6d_pose",
"dim": 9,
"convert_rotation": true,
"note": "3D translation + 6D rotation (rot6d) -- expanded from 6D source by rotation conversion"
}
],
"normalization": {
"type": "RAMEN_MIN_MAX",
"source": "assets/ramen_stats.json",
"stats_dim": 16,
"stats_fingerprint": {
"file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
"per_dim_q02_at_t0": [
-1.145761489868164,
0.00858306884765625,
0.02193450927734375,
-1.216334342956543,
-0.12111854553222656,
-0.6292438507080078,
0.0008642768952995539,
0.13416780531406403,
-0.3562192916870117,
0.02314358949661255,
0.0190497525036335,
-0.6936914324760437,
-0.9954489469528198,
0.11262556910514832,
-0.05564267560839653,
-0.1800755113363266
],
"per_dim_q98_at_t0": [
-0.06275272369384766,
2.0300216674804688,
1.7050046920776367,
0.12226295471191406,
0.3057527542114258,
0.7852678298950195,
0.06498069316148758,
0.340210884809494,
-0.04877800866961479,
0.2023451328277588,
0.9633501768112183,
-0.021005695685744286,
-0.030398914590477943,
0.9935086965560913,
0.9875925779342651,
0.14838503301143646
]
}
}
},
"actions": {
"total_dim": 17,
"horizon": 32,
"sub_keys": [
{
"key": "action",
"dim": 7,
"convert_rotation": false,
"note": "raw 7D joint action (deltas, RAMEN-normalized)"
},
{
"key": "action.eef_pose",
"dim": 10,
"convert_rotation": true,
"note": "3D translation + 6D rotation + 1D gripper -- absolute pose in 6D rotation form, dims 10-15 unnormalized"
}
],
"norm_mask": [
true,
true,
true,
true,
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
true
],
"delta_dims": {
"delta_mask": [
true,
true,
true,
true,
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
true
],
"absolute_dims_reason": "dims 10-15 are 6D rotation (rot6d) passed through unchanged per dataset_schema.rot6d_slice == [10, 16]"
},
"normalization": {
"type": "RAMEN_MIN_MAX",
"source": "assets/ramen_stats.json",
"stats_dim": 17,
"stats_layout": "(H=32, D=17) per-timestep",
"stats_fingerprint": {
"file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
"per_dim_q02_at_t0": [
-0.03802967071533203,
-0.06523323059082031,
-0.059611137956380844,
-0.049973487854003906,
-0.0247955322265625,
-0.058365821838378906,
-0.019222989678382874,
-0.011416382156312466,
-0.012026323936879635,
-0.022589389234781265,
0.006374520715326071,
-0.6977512240409851,
-0.9967345595359802,
0.13310591876506805,
-0.06537134945392609,
-0.17569203674793243,
-0.003581584896892309
],
"per_dim_q98_at_t0": [
0.02593994140625,
0.062180519104003906,
0.051880836486816406,
0.043107032775878906,
0.026703834533691406,
0.056458473205566406,
0.017502864822745323,
0.009446386247873306,
0.012261614203453064,
0.020949140191078186,
0.8820995092391968,
-0.014275601133704185,
-0.05388924479484558,
0.9945363998413086,
0.9843086004257202,
0.14365684986114502,
0.06591955572366714
]
}
}
},
"language": {
"tokenizer_class": "CLIPTokenizer",
"tokenizer_version": "openai/clip-vit-base-patch16",
"max_sequence_length": 77,
"default_prompt": "build a block tower",
"training_prompts": null
},
"temporal": {
"n_obs_steps": 2,
"observation_delta_indices": [
-1,
0
],
"delta_timestamps": null,
"control_rate_hz": null
},
"training_datasets": [
{
"repo": "pravsels/dit_block_tower_norm_fix",
"commit": null,
"version": null,
"num_episodes": null,
"total_frames": null,
"episode_filter": null,
"sampling_weight": null,
"key_rename_map": {},
"delta_timestamps_at_training": null,
"contributes_to_norm_stats": true
}
]
},
"model_identity": {
"class_name": "MultiTaskDiTPolicy",
"class_module": "multitask_dit_policy.model.model",
"config_architectures": [],
"resolved_via": "direct_import",
"resolved_class_name": "MultiTaskDiTPolicy",
"library_versions": {
"torch": "2.10.0",
"transformers": "5.4.0",
"timm": "1.0.26",
"safetensors": "0.7.0",
"draccus": "0.10.0",
"lerobot": "0.5.0",
"multitask_dit_policy": "0.1.0"
},
"runtime_constraints": {
"required_versions": {
"transformers": "==5.4.0",
"torch": ">=2.10.0",
"timm": ">=1.0.26"
},
"required_python": ">=3.12,<3.13",
"known_incompatible": [
"transformers>=5.5.0 (CLIP key layout change breaks weight loading)"
]
},
"python_version": "3.12.3",
"cuda_version": "12.8"
},
"model_internals": {
"module_hierarchy": [
{
"name": "",
"class": "MultiTaskDiTPolicy",
"children": [
{
"name": "observation_encoder",
"class": "ObservationEncoder"
},
{
"name": "action_head",
"class": "DiffusionActionHead"
}
]
}
],
"parameters": {
"summary": {
"total_params": 335072273,
"trainable_params": 335072273,
"frozen_params": 0,
"total_bytes": 1340286788,
"dtype_breakdown": {
"float32": 335072273
}
}
},
"buffers": [
{
"name": "observation_encoder.text_encoder.text_encoder.text_model.embeddings.position_ids",
"shape": [
1,
77
],
"dtype": "int64"
}
],
"state_dict": {
"expected_keys_count": 569,
"found_keys_count": 569,
"missing_keys": [],
"unexpected_keys": []
},
"pretrained_provenance": [
{
"submodule": "observation_encoder.vision",
"source": "timm",
"timm_string": "vit_base_patch16_clip_224.openai",
"hf_revision": "timm/vit_base_patch16_clip_224.openai",
"frozen_in_training": false,
"lr_multiplier": 0.1
},
{
"submodule": "observation_encoder.text",
"source": "huggingface",
"timm_string": null,
"hf_revision": "57c216476eefef5ab752ec549e440a49ae4ae5f3",
"frozen_in_training": false,
"lr_multiplier": null
}
],
"quantization": {
"scheme": "none",
"per_tensor_scales": null
},
"forward_graph": {
"forward_signature": null,
"expected_input_keys": [
"observation.images.front",
"observation.images.wrist",
"observation.state",
"task"
],
"sample_input_shapes": {
"observation.images.front": [
1,
2,
3,
480,
640
],
"observation.images.wrist": [
1,
2,
3,
480,
640
],
"observation.state": [
1,
2,
16
]
},
"sample_output_shapes": {
"action": [
1,
32,
17
]
},
"flops_estimate": null,
"peak_memory_inference_b1_bytes": null
},
"numerical_health": {
"determinism": {
"passed": true,
"max_abs_diff": 0.0,
"loss_value": 2.032438039779663
},
"no_nan_inf": {
"passed": true,
"loss_value": 2.032438039779663,
"action_n_nan": 0,
"action_n_inf": 0
},
"dropout_in_eval": {
"passed": true,
"total_dropout_modules": 0,
"still_training": []
},
"bn_running_stats_present": {
"passed": true,
"total_bn_modules": 0,
"with_running_stats": 0
}
}
},
"output_spec": {
"actions": {
"layout": "mirrors input_contract.actions",
"sub_keys": "see input_contract.actions.sub_keys",
"horizon": 32,
"control_rate_hz": null,
"action_latency_budget_ms": null
},
"auxiliary_outputs": {
"reward_head": null,
"value_head": null,
"latents_exposed": false,
"attention_maps_exposed": false
},
"inference_parameters": {
"type": "diffusion",
"num_inference_steps": 20,
"scheduler": "DDIM",
"prediction_type": "epsilon",
"clip_sample": true,
"clip_sample_range": 1.0,
"chunk_aggregation": "first_n_action_steps",
"chunks_executed_per_inference": 32,
"extra": {
"num_train_timesteps": 100,
"beta_schedule": "squaredcos_cap_v2",
"ramen_clip_value": 1.5
}
},
"post_processing": {
"unnormalize": true,
"delta_to_absolute": {
"applies_to_dims": "0-9, 16 (RAMEN delta dims)",
"method": "add to current observed state"
},
"action_smoothing": null,
"action_clamping": null
},
"smoke_results": {
"calibration_batch_source": "synthetic (torch.rand for images, torch.randn for state, fixed prompt 'build a block tower')",
"calibration_batch_size": 2,
"determinism": {
"status": "pass",
"max_abs_diff": 0.0,
"method": "two forward passes with torch.manual_seed(0); compare loss tensors via torch.allclose-equivalent diff",
"details": {}
},
"nan_inf": {
"status": "pass",
"n_nan": 0,
"n_inf": 0,
"samples_checked": 1088,
"details": {}
},
"liveness": {
"status": "pass",
"std": 0.5793782472610474,
"mean": 0.0665111392736435,
"criterion": "action std > 1e-5",
"details": {}
},
"distribution": {
"status": null,
"ratio_in_acceptable_range": true,
"min": -0.9089961647987366,
"max": 1.459722876548767,
"mean": 0.0665111392736435,
"std": 0.5793782472610474,
"per_dim_mean": [
0.48858821392059326,
-0.6914477944374084,
-0.3306178152561188,
0.29667824506759644,
0.6836938261985779,
0.3917674720287323,
-0.43627968430519104,
-0.16753537952899933,
0.06529225409030914,
0.3296715021133423,
0.62017822265625,
0.1883048117160797,
-0.624245285987854,
0.04247837886214256,
0.8990960717201233,
0.14396880567073822,
-0.7689023613929749
],
"per_dim_std": [
0.44592636823654175,
0.18982458114624023,
0.1246829554438591,
0.21343328058719635,
0.6306062936782837,
0.15469537675380707,
0.17833946645259857,
0.29920586943626404,
0.21930228173732758,
0.12209426611661911,
0.2748163640499115,
0.5151214599609375,
0.1576608568429947,
0.4949493110179901,
0.14426289498806,
0.38943856954574585,
0.14056310057640076
],
"details": {}
},
"range_check": {
"status": null,
"in_expected_range": true,
"expected_range": [
-1.5,
1.5
],
"actual_min": -0.9089961647987366,
"actual_max": 1.459722876548767,
"rationale": "ramen_clip_value=1.5; outputs are RAMEN-normalized actions before unnormalization",
"details": {}
}
}
},
"weight_integrity": {
"weight_files": [
{
"path": "checkpoints/29000/params/model.safetensors",
"sha256": "39a8ac32231cedcd6108c65464db51401babf811a489701e01c1772a448d2530",
"size_bytes": 1340286788
},
{
"path": "checkpoints/29000/params/config.json",
"sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
"size_bytes": 3037
},
{
"path": "assets/ramen_stats.json",
"sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
"size_bytes": 34327
}
],
"manifest_hash": null
},
"provenance": {
"run_log_path": "https://wandb.ai/pravsels/dit_block_tower_norm_fix/runs/ksuxe451",
"training_repo": "https://github.com/pravsels/multitask_dit_policy",
"training_repo_commit": "af0a43a512841aa1f4d6bb2f93755e5358dca8cb",
"config_snapshot_path": "checkpoints/29000/params/config.json",
"merged_config_sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
"parent_checkpoint": null,
"parent_description": null
},
"transform_pipeline": [
{
"order": 1,
"name": "resize_images",
"applies_to": "all_images",
"operation": "resize",
"direction": "input",
"parameters": {
"target_size": [
224,
224
]
},
"check_type": "static",
"check_description": "compare config.observation_encoder.vision.resize_shape"
},
{
"order": 2,
"name": "imagenet_normalize_images",
"applies_to": "all_images",
"operation": "imagenet_normalize",
"direction": "input",
"parameters": {
"mean": [
0.485,
0.456,
0.406
],
"std": [
0.229,
0.224,
0.225
]
},
"check_type": "static",
"check_description": "verify ImageNet mean/std constants match CLIP encoder expectation"
},
{
"order": 3,
"name": "rot6d_expand_state",
"applies_to": "state",
"operation": "rotation_conversion",
"direction": "input",
"parameters": {
"source_dim": 13,
"target_dim": 16,
"rot6d_slice": [
10,
16
],
"source_repr": "rpy_6d",
"target_repr": "rot6d_9d"
},
"check_type": "static",
"check_description": "verify norm_mask + rot6d_slice match config.dataset_schema"
},
{
"order": 4,
"name": "ramen_normalize_state",
"applies_to": "state",
"operation": "ramen_normalize",
"direction": "input",
"parameters": {
"stats_file": "assets/ramen_stats.json",
"clip_value": 1.5,
"exempt_dims": [
10,
11,
12,
13,
14,
15
]
},
"check_type": "static",
"check_description": "verify stats file hash + q02/q98 fingerprint at t0"
},
{
"order": 5,
"name": "compute_delta_actions",
"applies_to": "action",
"operation": "delta",
"direction": "input",
"parameters": {
"method": "action - state on norm_mask'd dims",
"delta_mask": [
true,
true,
true,
true,
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
true
]
},
"check_type": "static",
"check_description": "verify delta_dims mask matches norm_mask; absolute dims (10-15) are rot6d"
},
{
"order": 6,
"name": "ramen_normalize_actions",
"applies_to": "action",
"operation": "ramen_normalize",
"direction": "input",
"parameters": {
"stats_file": "assets/ramen_stats.json",
"stats_layout": "(H=32, D=17) per-timestep",
"clip_value": 1.5,
"exempt_dims": [
10,
11,
12,
13,
14,
15
]
},
"check_type": "static",
"check_description": "verify stats file hash + per-timestep q02/q98 fingerprint"
},
{
"order": 7,
"name": "stack_cameras",
"applies_to": "all_images",
"operation": "stack_cameras",
"direction": "input",
"parameters": {
"key_order": [
"observation.images.front",
"observation.images.wrist"
]
},
"check_type": "static",
"check_description": "verify camera key order matches config.input_features ordering"
},
{
"order": 8,
"name": "temporal_stack",
"applies_to": "all",
"operation": "temporal_stack",
"direction": "input",
"parameters": {
"n_obs_steps": 2,
"observation_delta_indices": [
-1,
0
]
},
"check_type": "static",
"check_description": "verify n_obs_steps and delta_indices match config"
},
{
"order": 9,
"name": "clip_tokenize_text",
"applies_to": "text",
"operation": "tokenize",
"direction": "input",
"parameters": {
"tokenizer": "openai/clip-vit-base-patch16",
"max_length": 77
},
"check_type": "static",
"check_description": "verify tokenizer model and max_length"
},
{
"order": 10,
"name": "diffusion_forward",
"applies_to": "all",
"operation": "diffusion_inference",
"direction": "input",
"parameters": {
"num_inference_steps": 20,
"scheduler": "DDIM",
"prediction_type": "epsilon",
"clip_sample_range": 1.0
},
"check_type": "static",
"check_description": "verify inference parameters match config.objective"
},
{
"order": 11,
"name": "ramen_unnormalize_actions",
"applies_to": "action",
"operation": "ramen_unnormalize",
"direction": "output",
"parameters": {
"stats_file": "assets/ramen_stats.json",
"inverse_of": "step 6"
},
"check_type": "static",
"check_description": "verify uses same stats and mask as step 6"
},
{
"order": 12,
"name": "delta_to_absolute",
"applies_to": "action",
"operation": "delta_to_absolute",
"direction": "output",
"parameters": {
"method": "add current observed state on delta_mask'd dims",
"delta_mask": [
true,
true,
true,
true,
true,
true,
true,
true,
true,
true,
false,
false,
false,
false,
false,
false,
true
]
},
"check_type": "static",
"check_description": "verify dims match step 5 delta_mask"
}
],
"reference_test_vector": null,
"norm_round_trip_results": [],
"known_issues": [
{
"id": "transformers_drift",
"severity": "critical",
"description": "transformers>=5.5.0 changes CLIP ViT key layout (text_model.encoder vs text_encoder), breaking weight loading",
"workaround": "Pin transformers==5.4.0 in runtime environment",
"check_type": "static"
},
{
"id": "ros_pythonpath_leak",
"severity": "warning",
"description": "/opt/ros paths on PYTHONPATH break Python 3.12 imports (importlib compat)",
"workaround": "Unset or filter PYTHONPATH before launching: export PYTHONPATH=$(echo $PYTHONPATH | tr ':' '\\n' | grep -v /opt/ros | paste -sd ':')",
"check_type": "static"
},
{
"id": "ramen_stats_format",
"severity": "warning",
"description": "Checkpoint uses ramen_stats.json (RAMEN format), not dataset_stats.json (LeRobot format). Loading code must detect format and route accordingly.",
"workaround": "Use stats_format='ramen' when constructing adapter; load_ramen_stats() handles the format.",
"check_type": "static"
}
]
}