dit_block_tower_norm_fix / MODEL_PASSPORT.json

Add MODEL_PASSPORT.json (v0.2)

00902c5 verified 19 days ago

23 kB

	{
	"schema_version": "0.2",
	"generated_by": {
	"tool": "manual-by-agent",
	"version": "0.2",
	"agent": "claude-opus-4.6"
	},
	"generated_at": "2026-04-28T12:06:09Z",
	"stack": "multitask_dit_policy",
	"input_contract": {
	"images": [
	{
	"key": "observation.images.front",
	"aliases": [],
	"raw_shape": [
	3,
	480,
	640
	],
	"encoder_resize": [
	224,
	224
	],
	"crop": null,
	"color_order": "RGB",
	"channel_layout": "CHW",
	"dtype": "float32",
	"value_range": [
	0.0,
	1.0
	],
	"normalization": {
	"type": "MEAN_STD",
	"scope": "VISUAL",
	"applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
	},
	"augmentations_in_training": [],
	"physical_mounting": "front-mounted exterior camera",
	"camera_serial": null,
	"camera_usb_path": null,
	"reference_frame_hash": null,
	"reference_frame_path": null
	},
	{
	"key": "observation.images.wrist",
	"aliases": [],
	"raw_shape": [
	3,
	480,
	640
	],
	"encoder_resize": [
	224,
	224
	],
	"crop": null,
	"color_order": "RGB",
	"channel_layout": "CHW",
	"dtype": "float32",
	"value_range": [
	0.0,
	1.0
	],
	"normalization": {
	"type": "MEAN_STD",
	"scope": "VISUAL",
	"applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
	},
	"augmentations_in_training": [],
	"physical_mounting": "wrist-mounted camera",
	"camera_serial": null,
	"camera_usb_path": null,
	"reference_frame_hash": null,
	"reference_frame_path": null
	}
	],
	"state": {
	"total_dim": 16,
	"sub_keys": [
	{
	"key": "observation.state",
	"dim": 7,
	"convert_rotation": false,
	"note": "raw 7D joint state (no rotation conversion)"
	},
	{
	"key": "observation.eef_6d_pose",
	"dim": 9,
	"convert_rotation": true,
	"note": "3D translation + 6D rotation (rot6d) -- expanded from 6D source by rotation conversion"
	}
	],
	"normalization": {
	"type": "RAMEN_MIN_MAX",
	"source": "assets/ramen_stats.json",
	"stats_dim": 16,
	"stats_fingerprint": {
	"file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
	"per_dim_q02_at_t0": [
	-1.145761489868164,
	0.00858306884765625,
	0.02193450927734375,
	-1.216334342956543,
	-0.12111854553222656,
	-0.6292438507080078,
	0.0008642768952995539,
	0.13416780531406403,
	-0.3562192916870117,
	0.02314358949661255,
	0.0190497525036335,
	-0.6936914324760437,
	-0.9954489469528198,
	0.11262556910514832,
	-0.05564267560839653,
	-0.1800755113363266
	],
	"per_dim_q98_at_t0": [
	-0.06275272369384766,
	2.0300216674804688,
	1.7050046920776367,
	0.12226295471191406,
	0.3057527542114258,
	0.7852678298950195,
	0.06498069316148758,
	0.340210884809494,
	-0.04877800866961479,
	0.2023451328277588,
	0.9633501768112183,
	-0.021005695685744286,
	-0.030398914590477943,
	0.9935086965560913,
	0.9875925779342651,
	0.14838503301143646
	]
	}
	}
	},
	"actions": {
	"total_dim": 17,
	"horizon": 32,
	"sub_keys": [
	{
	"key": "action",
	"dim": 7,
	"convert_rotation": false,
	"note": "raw 7D joint action (deltas, RAMEN-normalized)"
	},
	{
	"key": "action.eef_pose",
	"dim": 10,
	"convert_rotation": true,
	"note": "3D translation + 6D rotation + 1D gripper -- absolute pose in 6D rotation form, dims 10-15 unnormalized"
	}
	],
	"norm_mask": [
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,
	false,
	false,
	false,
	false,
	false,
	true
	],
	"delta_dims": {
	"delta_mask": [
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,
	false,
	false,
	false,
	false,
	false,
	true
	],
	"absolute_dims_reason": "dims 10-15 are 6D rotation (rot6d) passed through unchanged per dataset_schema.rot6d_slice == [10, 16]"
	},
	"normalization": {
	"type": "RAMEN_MIN_MAX",
	"source": "assets/ramen_stats.json",
	"stats_dim": 17,
	"stats_layout": "(H=32, D=17) per-timestep",
	"stats_fingerprint": {
	"file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
	"per_dim_q02_at_t0": [
	-0.03802967071533203,
	-0.06523323059082031,
	-0.059611137956380844,
	-0.049973487854003906,
	-0.0247955322265625,
	-0.058365821838378906,
	-0.019222989678382874,
	-0.011416382156312466,
	-0.012026323936879635,
	-0.022589389234781265,
	0.006374520715326071,
	-0.6977512240409851,
	-0.9967345595359802,
	0.13310591876506805,
	-0.06537134945392609,
	-0.17569203674793243,
	-0.003581584896892309
	],
	"per_dim_q98_at_t0": [
	0.02593994140625,
	0.062180519104003906,
	0.051880836486816406,
	0.043107032775878906,
	0.026703834533691406,
	0.056458473205566406,
	0.017502864822745323,
	0.009446386247873306,
	0.012261614203453064,
	0.020949140191078186,
	0.8820995092391968,
	-0.014275601133704185,
	-0.05388924479484558,
	0.9945363998413086,
	0.9843086004257202,
	0.14365684986114502,
	0.06591955572366714
	]
	}
	}
	},
	"language": {
	"tokenizer_class": "CLIPTokenizer",
	"tokenizer_version": "openai/clip-vit-base-patch16",
	"max_sequence_length": 77,
	"default_prompt": "build a block tower",
	"training_prompts": null
	},
	"temporal": {
	"n_obs_steps": 2,
	"observation_delta_indices": [
	-1,
	0
	],
	"delta_timestamps": null,
	"control_rate_hz": null
	},
	"training_datasets": [
	{
	"repo": "pravsels/dit_block_tower_norm_fix",
	"commit": null,
	"version": null,
	"num_episodes": null,
	"total_frames": null,
	"episode_filter": null,
	"sampling_weight": null,
	"key_rename_map": {},
	"delta_timestamps_at_training": null,
	"contributes_to_norm_stats": true
	}
	]
	},
	"model_identity": {
	"class_name": "MultiTaskDiTPolicy",
	"class_module": "multitask_dit_policy.model.model",
	"config_architectures": [],
	"resolved_via": "direct_import",
	"resolved_class_name": "MultiTaskDiTPolicy",
	"library_versions": {
	"torch": "2.10.0",
	"transformers": "5.4.0",
	"timm": "1.0.26",
	"safetensors": "0.7.0",
	"draccus": "0.10.0",
	"lerobot": "0.5.0",
	"multitask_dit_policy": "0.1.0"
	},
	"runtime_constraints": {
	"required_versions": {
	"transformers": "==5.4.0",
	"torch": ">=2.10.0",
	"timm": ">=1.0.26"
	},
	"required_python": ">=3.12,<3.13",
	"known_incompatible": [
	"transformers>=5.5.0 (CLIP key layout change breaks weight loading)"
	]
	},
	"python_version": "3.12.3",
	"cuda_version": "12.8"
	},
	"model_internals": {
	"module_hierarchy": [
	{
	"name": "",
	"class": "MultiTaskDiTPolicy",
	"children": [
	{
	"name": "observation_encoder",
	"class": "ObservationEncoder"
	},
	{
	"name": "action_head",
	"class": "DiffusionActionHead"
	}
	]
	}
	],
	"parameters": {
	"summary": {
	"total_params": 335072273,
	"trainable_params": 335072273,
	"frozen_params": 0,
	"total_bytes": 1340286788,
	"dtype_breakdown": {
	"float32": 335072273
	}
	}
	},
	"buffers": [
	{
	"name": "observation_encoder.text_encoder.text_encoder.text_model.embeddings.position_ids",
	"shape": [
	1,
	77
	],
	"dtype": "int64"
	}
	],
	"state_dict": {
	"expected_keys_count": 569,
	"found_keys_count": 569,
	"missing_keys": [],
	"unexpected_keys": []
	},
	"pretrained_provenance": [
	{
	"submodule": "observation_encoder.vision",
	"source": "timm",
	"timm_string": "vit_base_patch16_clip_224.openai",
	"hf_revision": "timm/vit_base_patch16_clip_224.openai",
	"frozen_in_training": false,
	"lr_multiplier": 0.1
	},
	{
	"submodule": "observation_encoder.text",
	"source": "huggingface",
	"timm_string": null,
	"hf_revision": "57c216476eefef5ab752ec549e440a49ae4ae5f3",
	"frozen_in_training": false,
	"lr_multiplier": null
	}
	],
	"quantization": {
	"scheme": "none",
	"per_tensor_scales": null
	},
	"forward_graph": {
	"forward_signature": null,
	"expected_input_keys": [
	"observation.images.front",
	"observation.images.wrist",
	"observation.state",
	"task"
	],
	"sample_input_shapes": {
	"observation.images.front": [
	1,
	2,
	3,
	480,
	640
	],
	"observation.images.wrist": [
	1,
	2,
	3,
	480,
	640
	],
	"observation.state": [
	1,
	2,
	16
	]
	},
	"sample_output_shapes": {
	"action": [
	1,
	32,
	17
	]
	},
	"flops_estimate": null,
	"peak_memory_inference_b1_bytes": null
	},
	"numerical_health": {
	"determinism": {
	"passed": true,
	"max_abs_diff": 0.0,
	"loss_value": 2.032438039779663
	},
	"no_nan_inf": {
	"passed": true,
	"loss_value": 2.032438039779663,
	"action_n_nan": 0,
	"action_n_inf": 0
	},
	"dropout_in_eval": {
	"passed": true,
	"total_dropout_modules": 0,
	"still_training": []
	},
	"bn_running_stats_present": {
	"passed": true,
	"total_bn_modules": 0,
	"with_running_stats": 0
	}
	}
	},
	"output_spec": {
	"actions": {
	"layout": "mirrors input_contract.actions",
	"sub_keys": "see input_contract.actions.sub_keys",
	"horizon": 32,
	"control_rate_hz": null,
	"action_latency_budget_ms": null
	},
	"auxiliary_outputs": {
	"reward_head": null,
	"value_head": null,
	"latents_exposed": false,
	"attention_maps_exposed": false
	},
	"inference_parameters": {
	"type": "diffusion",
	"num_inference_steps": 20,
	"scheduler": "DDIM",
	"prediction_type": "epsilon",
	"clip_sample": true,
	"clip_sample_range": 1.0,
	"chunk_aggregation": "first_n_action_steps",
	"chunks_executed_per_inference": 32,
	"extra": {
	"num_train_timesteps": 100,
	"beta_schedule": "squaredcos_cap_v2",
	"ramen_clip_value": 1.5
	}
	},
	"post_processing": {
	"unnormalize": true,
	"delta_to_absolute": {
	"applies_to_dims": "0-9, 16 (RAMEN delta dims)",
	"method": "add to current observed state"
	},
	"action_smoothing": null,
	"action_clamping": null
	},
	"smoke_results": {
	"calibration_batch_source": "synthetic (torch.rand for images, torch.randn for state, fixed prompt 'build a block tower')",
	"calibration_batch_size": 2,
	"determinism": {
	"status": "pass",
	"max_abs_diff": 0.0,
	"method": "two forward passes with torch.manual_seed(0); compare loss tensors via torch.allclose-equivalent diff",
	"details": {}
	},
	"nan_inf": {
	"status": "pass",
	"n_nan": 0,
	"n_inf": 0,
	"samples_checked": 1088,
	"details": {}
	},
	"liveness": {
	"status": "pass",
	"std": 0.5793782472610474,
	"mean": 0.0665111392736435,
	"criterion": "action std > 1e-5",
	"details": {}
	},
	"distribution": {
	"status": null,
	"ratio_in_acceptable_range": true,
	"min": -0.9089961647987366,
	"max": 1.459722876548767,
	"mean": 0.0665111392736435,
	"std": 0.5793782472610474,
	"per_dim_mean": [
	0.48858821392059326,
	-0.6914477944374084,
	-0.3306178152561188,
	0.29667824506759644,
	0.6836938261985779,
	0.3917674720287323,
	-0.43627968430519104,
	-0.16753537952899933,
	0.06529225409030914,
	0.3296715021133423,
	0.62017822265625,
	0.1883048117160797,
	-0.624245285987854,
	0.04247837886214256,
	0.8990960717201233,
	0.14396880567073822,
	-0.7689023613929749
	],
	"per_dim_std": [
	0.44592636823654175,
	0.18982458114624023,
	0.1246829554438591,
	0.21343328058719635,
	0.6306062936782837,
	0.15469537675380707,
	0.17833946645259857,
	0.29920586943626404,
	0.21930228173732758,
	0.12209426611661911,
	0.2748163640499115,
	0.5151214599609375,
	0.1576608568429947,
	0.4949493110179901,
	0.14426289498806,
	0.38943856954574585,
	0.14056310057640076
	],
	"details": {}
	},
	"range_check": {
	"status": null,
	"in_expected_range": true,
	"expected_range": [
	-1.5,
	1.5
	],
	"actual_min": -0.9089961647987366,
	"actual_max": 1.459722876548767,
	"rationale": "ramen_clip_value=1.5; outputs are RAMEN-normalized actions before unnormalization",
	"details": {}
	}
	}
	},
	"weight_integrity": {
	"weight_files": [
	{
	"path": "checkpoints/29000/params/model.safetensors",
	"sha256": "39a8ac32231cedcd6108c65464db51401babf811a489701e01c1772a448d2530",
	"size_bytes": 1340286788
	},
	{
	"path": "checkpoints/29000/params/config.json",
	"sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
	"size_bytes": 3037
	},
	{
	"path": "assets/ramen_stats.json",
	"sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
	"size_bytes": 34327
	}
	],
	"manifest_hash": null
	},
	"provenance": {
	"run_log_path": "https://wandb.ai/pravsels/dit_block_tower_norm_fix/runs/ksuxe451",
	"training_repo": "https://github.com/pravsels/multitask_dit_policy",
	"training_repo_commit": "af0a43a512841aa1f4d6bb2f93755e5358dca8cb",
	"config_snapshot_path": "checkpoints/29000/params/config.json",
	"merged_config_sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
	"parent_checkpoint": null,
	"parent_description": null
	},
	"transform_pipeline": [
	{
	"order": 1,
	"name": "resize_images",
	"applies_to": "all_images",
	"operation": "resize",
	"direction": "input",
	"parameters": {
	"target_size": [
	224,
	224
	]
	},
	"check_type": "static",
	"check_description": "compare config.observation_encoder.vision.resize_shape"
	},
	{
	"order": 2,
	"name": "imagenet_normalize_images",
	"applies_to": "all_images",
	"operation": "imagenet_normalize",
	"direction": "input",
	"parameters": {
	"mean": [
	0.485,
	0.456,
	0.406
	],
	"std": [
	0.229,
	0.224,
	0.225
	]
	},
	"check_type": "static",
	"check_description": "verify ImageNet mean/std constants match CLIP encoder expectation"
	},
	{
	"order": 3,
	"name": "rot6d_expand_state",
	"applies_to": "state",
	"operation": "rotation_conversion",
	"direction": "input",
	"parameters": {
	"source_dim": 13,
	"target_dim": 16,
	"rot6d_slice": [
	10,
	16
	],
	"source_repr": "rpy_6d",
	"target_repr": "rot6d_9d"
	},
	"check_type": "static",
	"check_description": "verify norm_mask + rot6d_slice match config.dataset_schema"
	},
	{
	"order": 4,
	"name": "ramen_normalize_state",
	"applies_to": "state",
	"operation": "ramen_normalize",
	"direction": "input",
	"parameters": {
	"stats_file": "assets/ramen_stats.json",
	"clip_value": 1.5,
	"exempt_dims": [
	10,
	11,
	12,
	13,
	14,
	15
	]
	},
	"check_type": "static",
	"check_description": "verify stats file hash + q02/q98 fingerprint at t0"
	},
	{
	"order": 5,
	"name": "compute_delta_actions",
	"applies_to": "action",
	"operation": "delta",
	"direction": "input",
	"parameters": {
	"method": "action - state on norm_mask'd dims",
	"delta_mask": [
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,
	false,
	false,
	false,
	false,
	false,
	true
	]
	},
	"check_type": "static",
	"check_description": "verify delta_dims mask matches norm_mask; absolute dims (10-15) are rot6d"
	},
	{
	"order": 6,
	"name": "ramen_normalize_actions",
	"applies_to": "action",
	"operation": "ramen_normalize",
	"direction": "input",
	"parameters": {
	"stats_file": "assets/ramen_stats.json",
	"stats_layout": "(H=32, D=17) per-timestep",
	"clip_value": 1.5,
	"exempt_dims": [
	10,
	11,
	12,
	13,
	14,
	15
	]
	},
	"check_type": "static",
	"check_description": "verify stats file hash + per-timestep q02/q98 fingerprint"
	},
	{
	"order": 7,
	"name": "stack_cameras",
	"applies_to": "all_images",
	"operation": "stack_cameras",
	"direction": "input",
	"parameters": {
	"key_order": [
	"observation.images.front",
	"observation.images.wrist"
	]
	},
	"check_type": "static",
	"check_description": "verify camera key order matches config.input_features ordering"
	},
	{
	"order": 8,
	"name": "temporal_stack",
	"applies_to": "all",
	"operation": "temporal_stack",
	"direction": "input",
	"parameters": {
	"n_obs_steps": 2,
	"observation_delta_indices": [
	-1,
	0
	]
	},
	"check_type": "static",
	"check_description": "verify n_obs_steps and delta_indices match config"
	},
	{
	"order": 9,
	"name": "clip_tokenize_text",
	"applies_to": "text",
	"operation": "tokenize",
	"direction": "input",
	"parameters": {
	"tokenizer": "openai/clip-vit-base-patch16",
	"max_length": 77
	},
	"check_type": "static",
	"check_description": "verify tokenizer model and max_length"
	},
	{
	"order": 10,
	"name": "diffusion_forward",
	"applies_to": "all",
	"operation": "diffusion_inference",
	"direction": "input",
	"parameters": {
	"num_inference_steps": 20,
	"scheduler": "DDIM",
	"prediction_type": "epsilon",
	"clip_sample_range": 1.0
	},
	"check_type": "static",
	"check_description": "verify inference parameters match config.objective"
	},
	{
	"order": 11,
	"name": "ramen_unnormalize_actions",
	"applies_to": "action",
	"operation": "ramen_unnormalize",
	"direction": "output",
	"parameters": {
	"stats_file": "assets/ramen_stats.json",
	"inverse_of": "step 6"
	},
	"check_type": "static",
	"check_description": "verify uses same stats and mask as step 6"
	},
	{
	"order": 12,
	"name": "delta_to_absolute",
	"applies_to": "action",
	"operation": "delta_to_absolute",
	"direction": "output",
	"parameters": {
	"method": "add current observed state on delta_mask'd dims",
	"delta_mask": [
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	true,
	false,
	false,
	false,
	false,
	false,
	false,
	true
	]
	},
	"check_type": "static",
	"check_description": "verify dims match step 5 delta_mask"
	}
	],
	"reference_test_vector": null,
	"norm_round_trip_results": [],
	"known_issues": [
	{
	"id": "transformers_drift",
	"severity": "critical",
	"description": "transformers>=5.5.0 changes CLIP ViT key layout (text_model.encoder vs text_encoder), breaking weight loading",
	"workaround": "Pin transformers==5.4.0 in runtime environment",
	"check_type": "static"
	},
	{
	"id": "ros_pythonpath_leak",
	"severity": "warning",
	"description": "/opt/ros paths on PYTHONPATH break Python 3.12 imports (importlib compat)",
	"workaround": "Unset or filter PYTHONPATH before launching: export PYTHONPATH=$(echo $PYTHONPATH \| tr ':' '\\n' \| grep -v /opt/ros \| paste -sd ':')",
	"check_type": "static"
	},
	{
	"id": "ramen_stats_format",
	"severity": "warning",
	"description": "Checkpoint uses ramen_stats.json (RAMEN format), not dataset_stats.json (LeRobot format). Loading code must detect format and route accordingly.",
	"workaround": "Use stats_format='ramen' when constructing adapter; load_ramen_stats() handles the format.",
	"check_type": "static"
	}
	]
	}