{
  "schema_version": "0.2",
  "generated_by": {
    "tool": "manual-by-agent",
    "version": "0.2",
    "agent": "claude-opus-4.6"
  },
  "generated_at": "2026-04-28T12:06:09Z",
  "stack": "multitask_dit_policy",
  "input_contract": {
    "images": [
      {
        "key": "observation.images.front",
        "aliases": [],
        "raw_shape": [
          3,
          480,
          640
        ],
        "encoder_resize": [
          224,
          224
        ],
        "crop": null,
        "color_order": "RGB",
        "channel_layout": "CHW",
        "dtype": "float32",
        "value_range": [
          0.0,
          1.0
        ],
        "normalization": {
          "type": "MEAN_STD",
          "scope": "VISUAL",
          "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
        },
        "augmentations_in_training": [],
        "physical_mounting": "front-mounted exterior camera",
        "camera_serial": null,
        "camera_usb_path": null,
        "reference_frame_hash": null,
        "reference_frame_path": null
      },
      {
        "key": "observation.images.wrist",
        "aliases": [],
        "raw_shape": [
          3,
          480,
          640
        ],
        "encoder_resize": [
          224,
          224
        ],
        "crop": null,
        "color_order": "RGB",
        "channel_layout": "CHW",
        "dtype": "float32",
        "value_range": [
          0.0,
          1.0
        ],
        "normalization": {
          "type": "MEAN_STD",
          "scope": "VISUAL",
          "applied_by": "observation_encoder.vision (CLIP ViT-B/16 .openai)"
        },
        "augmentations_in_training": [],
        "physical_mounting": "wrist-mounted camera",
        "camera_serial": null,
        "camera_usb_path": null,
        "reference_frame_hash": null,
        "reference_frame_path": null
      }
    ],
    "state": {
      "total_dim": 16,
      "sub_keys": [
        {
          "key": "observation.state",
          "dim": 7,
          "convert_rotation": false,
          "note": "raw 7D joint state (no rotation conversion)"
        },
        {
          "key": "observation.eef_6d_pose",
          "dim": 9,
          "convert_rotation": true,
          "note": "3D translation + 6D rotation (rot6d) -- expanded to 9D from the 6D source pose (3D translation + rpy) by rotation conversion"
        }
      ],
      "normalization": {
        "type": "RAMEN_MIN_MAX",
        "source": "assets/ramen_stats.json",
        "stats_dim": 16,
        "stats_fingerprint": {
          "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
          "per_dim_q02_at_t0": [
            -1.145761489868164,
            0.00858306884765625,
            0.02193450927734375,
            -1.216334342956543,
            -0.12111854553222656,
            -0.6292438507080078,
            0.0008642768952995539,
            0.13416780531406403,
            -0.3562192916870117,
            0.02314358949661255,
            0.0190497525036335,
            -0.6936914324760437,
            -0.9954489469528198,
            0.11262556910514832,
            -0.05564267560839653,
            -0.1800755113363266
          ],
          "per_dim_q98_at_t0": [
            -0.06275272369384766,
            2.0300216674804688,
            1.7050046920776367,
            0.12226295471191406,
            0.3057527542114258,
            0.7852678298950195,
            0.06498069316148758,
            0.340210884809494,
            -0.04877800866961479,
            0.2023451328277588,
            0.9633501768112183,
            -0.021005695685744286,
            -0.030398914590477943,
            0.9935086965560913,
            0.9875925779342651,
            0.14838503301143646
          ]
        }
      }
    },
    "actions": {
      "total_dim": 17,
      "horizon": 32,
      "sub_keys": [
        {
          "key": "action",
          "dim": 7,
          "convert_rotation": false,
          "note": "raw 7D joint action (deltas, RAMEN-normalized)"
        },
        {
          "key": "action.eef_pose",
          "dim": 10,
          "convert_rotation": true,
          "note": "3D translation + 6D rotation + 1D gripper -- absolute pose in 6D rotation form, dims 10-15 unnormalized"
        }
      ],
      "norm_mask": [
        true,
        true,
        true,
        true,
        true,
        true,
        true,
        true,
        true,
        true,
        false,
        false,
        false,
        false,
        false,
        false,
        true
      ],
      "delta_dims": {
        "delta_mask": [
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          false,
          false,
          false,
          false,
          false,
          false,
          true
        ],
        "absolute_dims_reason": "dims 10-15 are 6D rotation (rot6d) passed through unchanged per dataset_schema.rot6d_slice == [10, 16]"
      },
      "normalization": {
        "type": "RAMEN_MIN_MAX",
        "source": "assets/ramen_stats.json",
        "stats_dim": 17,
        "stats_layout": "(H=32, D=17) per-timestep",
        "stats_fingerprint": {
          "file_sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
          "per_dim_q02_at_t0": [
            -0.03802967071533203,
            -0.06523323059082031,
            -0.059611137956380844,
            -0.049973487854003906,
            -0.0247955322265625,
            -0.058365821838378906,
            -0.019222989678382874,
            -0.011416382156312466,
            -0.012026323936879635,
            -0.022589389234781265,
            0.006374520715326071,
            -0.6977512240409851,
            -0.9967345595359802,
            0.13310591876506805,
            -0.06537134945392609,
            -0.17569203674793243,
            -0.003581584896892309
          ],
          "per_dim_q98_at_t0": [
            0.02593994140625,
            0.062180519104003906,
            0.051880836486816406,
            0.043107032775878906,
            0.026703834533691406,
            0.056458473205566406,
            0.017502864822745323,
            0.009446386247873306,
            0.012261614203453064,
            0.020949140191078186,
            0.8820995092391968,
            -0.014275601133704185,
            -0.05388924479484558,
            0.9945363998413086,
            0.9843086004257202,
            0.14365684986114502,
            0.06591955572366714
          ]
        }
      }
    },
    "language": {
      "tokenizer_class": "CLIPTokenizer",
      "tokenizer_version": "openai/clip-vit-base-patch16",
      "max_sequence_length": 77,
      "default_prompt": "build a block tower",
      "training_prompts": null
    },
    "temporal": {
      "n_obs_steps": 2,
      "observation_delta_indices": [
        -1,
        0
      ],
      "delta_timestamps": null,
      "control_rate_hz": null
    },
    "training_datasets": [
      {
        "repo": "pravsels/dit_block_tower_norm_fix",
        "commit": null,
        "version": null,
        "num_episodes": null,
        "total_frames": null,
        "episode_filter": null,
        "sampling_weight": null,
        "key_rename_map": {},
        "delta_timestamps_at_training": null,
        "contributes_to_norm_stats": true
      }
    ]
  },
  "model_identity": {
    "class_name": "MultiTaskDiTPolicy",
    "class_module": "multitask_dit_policy.model.model",
    "config_architectures": [],
    "resolved_via": "direct_import",
    "resolved_class_name": "MultiTaskDiTPolicy",
    "library_versions": {
      "torch": "2.10.0",
      "transformers": "5.4.0",
      "timm": "1.0.26",
      "safetensors": "0.7.0",
      "draccus": "0.10.0",
      "lerobot": "0.5.0",
      "multitask_dit_policy": "0.1.0"
    },
    "runtime_constraints": {
      "required_versions": {
        "transformers": "==5.4.0",
        "torch": ">=2.10.0",
        "timm": ">=1.0.26"
      },
      "required_python": ">=3.12,<3.13",
      "known_incompatible": [
        "transformers>=5.5.0 (CLIP key layout change breaks weight loading)"
      ]
    },
    "python_version": "3.12.3",
    "cuda_version": "12.8"
  },
  "model_internals": {
    "module_hierarchy": [
      {
        "name": "",
        "class": "MultiTaskDiTPolicy",
        "children": [
          {
            "name": "observation_encoder",
            "class": "ObservationEncoder"
          },
          {
            "name": "action_head",
            "class": "DiffusionActionHead"
          }
        ]
      }
    ],
    "parameters": {
      "summary": {
        "total_params": 335072273,
        "trainable_params": 335072273,
        "frozen_params": 0,
        "total_bytes": 1340286788,
        "dtype_breakdown": {
          "float32": 335072273
        }
      }
    },
    "buffers": [
      {
        "name": "observation_encoder.text_encoder.text_encoder.text_model.embeddings.position_ids",
        "shape": [
          1,
          77
        ],
        "dtype": "int64"
      }
    ],
    "state_dict": {
      "expected_keys_count": 569,
      "found_keys_count": 569,
      "missing_keys": [],
      "unexpected_keys": []
    },
    "pretrained_provenance": [
      {
        "submodule": "observation_encoder.vision",
        "source": "timm",
        "timm_string": "vit_base_patch16_clip_224.openai",
        "hf_revision": "timm/vit_base_patch16_clip_224.openai",
        "frozen_in_training": false,
        "lr_multiplier": 0.1
      },
      {
        "submodule": "observation_encoder.text",
        "source": "huggingface",
        "timm_string": null,
        "hf_revision": "57c216476eefef5ab752ec549e440a49ae4ae5f3",
        "frozen_in_training": false,
        "lr_multiplier": null
      }
    ],
    "quantization": {
      "scheme": "none",
      "per_tensor_scales": null
    },
    "forward_graph": {
      "forward_signature": null,
      "expected_input_keys": [
        "observation.images.front",
        "observation.images.wrist",
        "observation.state",
        "task"
      ],
      "sample_input_shapes": {
        "observation.images.front": [
          1,
          2,
          3,
          480,
          640
        ],
        "observation.images.wrist": [
          1,
          2,
          3,
          480,
          640
        ],
        "observation.state": [
          1,
          2,
          16
        ]
      },
      "sample_output_shapes": {
        "action": [
          1,
          32,
          17
        ]
      },
      "flops_estimate": null,
      "peak_memory_inference_b1_bytes": null
    },
    "numerical_health": {
      "determinism": {
        "passed": true,
        "max_abs_diff": 0.0,
        "loss_value": 2.032438039779663
      },
      "no_nan_inf": {
        "passed": true,
        "loss_value": 2.032438039779663,
        "action_n_nan": 0,
        "action_n_inf": 0
      },
      "dropout_in_eval": {
        "passed": true,
        "total_dropout_modules": 0,
        "still_training": []
      },
      "bn_running_stats_present": {
        "passed": true,
        "total_bn_modules": 0,
        "with_running_stats": 0
      }
    }
  },
  "output_spec": {
    "actions": {
      "layout": "mirrors input_contract.actions",
      "sub_keys": "see input_contract.actions.sub_keys",
      "horizon": 32,
      "control_rate_hz": null,
      "action_latency_budget_ms": null
    },
    "auxiliary_outputs": {
      "reward_head": null,
      "value_head": null,
      "latents_exposed": false,
      "attention_maps_exposed": false
    },
    "inference_parameters": {
      "type": "diffusion",
      "num_inference_steps": 20,
      "scheduler": "DDIM",
      "prediction_type": "epsilon",
      "clip_sample": true,
      "clip_sample_range": 1.0,
      "chunk_aggregation": "first_n_action_steps",
      "chunks_executed_per_inference": 32,
      "extra": {
        "num_train_timesteps": 100,
        "beta_schedule": "squaredcos_cap_v2",
        "ramen_clip_value": 1.5
      }
    },
    "post_processing": {
      "unnormalize": true,
      "delta_to_absolute": {
        "applies_to_dims": "0-9, 16 (RAMEN delta dims)",
        "method": "add to current observed state"
      },
      "action_smoothing": null,
      "action_clamping": null
    },
    "smoke_results": {
      "calibration_batch_source": "synthetic (torch.rand for images, torch.randn for state, fixed prompt 'build a block tower')",
      "calibration_batch_size": 2,
      "determinism": {
        "status": "pass",
        "max_abs_diff": 0.0,
        "method": "two forward passes with torch.manual_seed(0); compare loss tensors via torch.allclose-equivalent diff",
        "details": {}
      },
      "nan_inf": {
        "status": "pass",
        "n_nan": 0,
        "n_inf": 0,
        "samples_checked": 1088,
        "details": {}
      },
      "liveness": {
        "status": "pass",
        "std": 0.5793782472610474,
        "mean": 0.0665111392736435,
        "criterion": "action std > 1e-5",
        "details": {}
      },
      "distribution": {
        "status": null,
        "ratio_in_acceptable_range": true,
        "min": -0.9089961647987366,
        "max": 1.459722876548767,
        "mean": 0.0665111392736435,
        "std": 0.5793782472610474,
        "per_dim_mean": [
          0.48858821392059326,
          -0.6914477944374084,
          -0.3306178152561188,
          0.29667824506759644,
          0.6836938261985779,
          0.3917674720287323,
          -0.43627968430519104,
          -0.16753537952899933,
          0.06529225409030914,
          0.3296715021133423,
          0.62017822265625,
          0.1883048117160797,
          -0.624245285987854,
          0.04247837886214256,
          0.8990960717201233,
          0.14396880567073822,
          -0.7689023613929749
        ],
        "per_dim_std": [
          0.44592636823654175,
          0.18982458114624023,
          0.1246829554438591,
          0.21343328058719635,
          0.6306062936782837,
          0.15469537675380707,
          0.17833946645259857,
          0.29920586943626404,
          0.21930228173732758,
          0.12209426611661911,
          0.2748163640499115,
          0.5151214599609375,
          0.1576608568429947,
          0.4949493110179901,
          0.14426289498806,
          0.38943856954574585,
          0.14056310057640076
        ],
        "details": {}
      },
      "range_check": {
        "status": null,
        "in_expected_range": true,
        "expected_range": [
          -1.5,
          1.5
        ],
        "actual_min": -0.9089961647987366,
        "actual_max": 1.459722876548767,
        "rationale": "ramen_clip_value=1.5; outputs are RAMEN-normalized actions before unnormalization",
        "details": {}
      }
    }
  },
  "weight_integrity": {
    "weight_files": [
      {
        "path": "checkpoints/29000/params/model.safetensors",
        "sha256": "39a8ac32231cedcd6108c65464db51401babf811a489701e01c1772a448d2530",
        "size_bytes": 1340286788
      },
      {
        "path": "checkpoints/29000/params/config.json",
        "sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
        "size_bytes": 3037
      },
      {
        "path": "assets/ramen_stats.json",
        "sha256": "02814cc786653100a776e196218915e881ac59d35dbb56dd5d4b263e0a37d500",
        "size_bytes": 34327
      }
    ],
    "manifest_hash": null
  },
  "provenance": {
    "run_log_path": "https://wandb.ai/pravsels/dit_block_tower_norm_fix/runs/ksuxe451",
    "training_repo": "https://github.com/pravsels/multitask_dit_policy",
    "training_repo_commit": "af0a43a512841aa1f4d6bb2f93755e5358dca8cb",
    "config_snapshot_path": "checkpoints/29000/params/config.json",
    "merged_config_sha256": "f812cd06ce47c3564db0d839f86e5742bc954dba9f9d6b6b732a6f2dda8823a1",
    "parent_checkpoint": null,
    "parent_description": null
  },
  "transform_pipeline": [
    {
      "order": 1,
      "name": "resize_images",
      "applies_to": "all_images",
      "operation": "resize",
      "direction": "input",
      "parameters": {
        "target_size": [
          224,
          224
        ]
      },
      "check_type": "static",
      "check_description": "compare config.observation_encoder.vision.resize_shape"
    },
    {
      "order": 2,
      "name": "imagenet_normalize_images",
      "applies_to": "all_images",
      "operation": "imagenet_normalize",
      "direction": "input",
      "parameters": {
        "mean": [
          0.485,
          0.456,
          0.406
        ],
        "std": [
          0.229,
          0.224,
          0.225
        ]
      },
      "check_type": "static",
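      "check_description": "verify these ImageNet mean/std constants are what the training pipeline applied; they differ from the OpenAI CLIP mean/std in the timm vit_base_patch16_clip_224.openai pretrained config, so confirm against the training code"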
    },
    {
      "order": 3,
      "name": "rot6d_expand_state",
      "applies_to": "state",
      "operation": "rotation_conversion",
      "direction": "input",
      "parameters": {
        "source_dim": 13,
        "target_dim": 16,
        "rot6d_slice": [
          10,
          16
        ],
        "source_repr": "rpy_6d",
        "target_repr": "rot6d_9d"
      },
      "check_type": "static",
      "check_description": "verify norm_mask + rot6d_slice match config.dataset_schema"
    },
    {
      "order": 4,
      "name": "ramen_normalize_state",
      "applies_to": "state",
      "operation": "ramen_normalize",
      "direction": "input",
      "parameters": {
        "stats_file": "assets/ramen_stats.json",
        "clip_value": 1.5,
        "exempt_dims": [
          10,
          11,
          12,
          13,
          14,
          15
        ]
      },
      "check_type": "static",
      "check_description": "verify stats file hash + q02/q98 fingerprint at t0"
    },
    {
      "order": 5,
      "name": "compute_delta_actions",
      "applies_to": "action",
      "operation": "delta",
      "direction": "input",
      "parameters": {
        "method": "action - state on delta_mask'd dims",
        "delta_mask": [
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          false,
          false,
          false,
          false,
          false,
          false,
          true
        ]
      },
      "check_type": "static",
      "check_description": "verify delta_dims mask matches norm_mask; absolute dims (10-15) are rot6d"
    },
    {
      "order": 6,
      "name": "ramen_normalize_actions",
      "applies_to": "action",
      "operation": "ramen_normalize",
      "direction": "input",
      "parameters": {
        "stats_file": "assets/ramen_stats.json",
        "stats_layout": "(H=32, D=17) per-timestep",
        "clip_value": 1.5,
        "exempt_dims": [
          10,
          11,
          12,
          13,
          14,
          15
        ]
      },
      "check_type": "static",
      "check_description": "verify stats file hash + per-timestep q02/q98 fingerprint"
    },
    {
      "order": 7,
      "name": "stack_cameras",
      "applies_to": "all_images",
      "operation": "stack_cameras",
      "direction": "input",
      "parameters": {
        "key_order": [
          "observation.images.front",
          "observation.images.wrist"
        ]
      },
      "check_type": "static",
      "check_description": "verify camera key order matches config.input_features ordering"
    },
    {
      "order": 8,
      "name": "temporal_stack",
      "applies_to": "all",
      "operation": "temporal_stack",
      "direction": "input",
      "parameters": {
        "n_obs_steps": 2,
        "observation_delta_indices": [
          -1,
          0
        ]
      },
      "check_type": "static",
      "check_description": "verify n_obs_steps and delta_indices match config"
    },
    {
      "order": 9,
      "name": "clip_tokenize_text",
      "applies_to": "text",
      "operation": "tokenize",
      "direction": "input",
      "parameters": {
        "tokenizer": "openai/clip-vit-base-patch16",
        "max_length": 77
      },
      "check_type": "static",
      "check_description": "verify tokenizer model and max_length"
    },
    {
      "order": 10,
      "name": "diffusion_forward",
      "applies_to": "all",
      "operation": "diffusion_inference",
      "direction": "input",
      "parameters": {
        "num_inference_steps": 20,
        "scheduler": "DDIM",
        "prediction_type": "epsilon",
        "clip_sample_range": 1.0
      },
      "check_type": "static",
      "check_description": "verify inference parameters match config.objective"
    },
    {
      "order": 11,
      "name": "ramen_unnormalize_actions",
      "applies_to": "action",
      "operation": "ramen_unnormalize",
      "direction": "output",
      "parameters": {
        "stats_file": "assets/ramen_stats.json",
        "inverse_of": "step 6"
      },
      "check_type": "static",
      "check_description": "verify uses same stats and mask as step 6"
    },
    {
      "order": 12,
      "name": "delta_to_absolute",
      "applies_to": "action",
      "operation": "delta_to_absolute",
      "direction": "output",
      "parameters": {
        "method": "add current observed state on delta_mask'd dims",
        "delta_mask": [
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          true,
          false,
          false,
          false,
          false,
          false,
          false,
          true
        ]
      },
      "check_type": "static",
      "check_description": "verify dims match step 5 delta_mask"
    }
  ],
  "reference_test_vector": null,
  "norm_round_trip_results": [],
  "known_issues": [
    {
      "id": "transformers_drift",
      "severity": "critical",
      "description": "transformers>=5.5.0 changes CLIP key layout (text_model.encoder vs text_encoder), breaking weight loading",
      "workaround": "Pin transformers==5.4.0 in runtime environment",
      "check_type": "static"
    },
    {
      "id": "ros_pythonpath_leak",
      "severity": "warning",
      "description": "/opt/ros paths on PYTHONPATH break Python 3.12 imports (importlib compat)",
      "workaround": "Unset or filter PYTHONPATH before launching: export PYTHONPATH=$(echo $PYTHONPATH | tr ':' '\\n' | grep -v /opt/ros | paste -sd ':')",
      "check_type": "static"
    },
    {
      "id": "ramen_stats_format",
      "severity": "warning",
      "description": "Checkpoint uses ramen_stats.json (RAMEN format), not dataset_stats.json (LeRobot format). Loading code must detect format and route accordingly.",
      "workaround": "Use stats_format='ramen' when constructing adapter; load_ramen_stats() handles the format.",
      "check_type": "static"
    }
  ]
}