Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

stage2_front_cam_step20000/HF_MODEL_CARD.md +33 -0
stage2_front_cam_step20000/README.md +37 -0
stage2_front_cam_step20000/SHA256SUMS +3 -0
stage2_front_cam_step20000/checkpoint_metadata.json +26 -0
stage2_front_cam_step20000/checkpoints/epoch=0-step=20000.ckpt +3 -0
stage2_front_cam_step20000/dataset_source.txt +2 -0
stage2_front_cam_step20000/training_config_snapshot.yaml +207 -0

stage2_front_cam_step20000/HF_MODEL_CARD.md ADDED Viewed

	@@ -0,0 +1,33 @@

+---
+tags:
+  - pytorch
+  - world-model
+  - robotics
+license: mit
+---
+# Interactive World Sim Checkpoints
+This repo hosts released checkpoint artifacts.
+Latest uploaded artifact:
+- `stage2_front_cam_step20000`
+- job: `2875089`
+- W&B: [Run `7skk0qh6`](https://wandb.ai/pravsels/interactive_world_sim/runs/7skk0qh6)
+- note: the run ended early after a NaN-gradient event; this checkpoint is kept as the current stage-2 baseline.
+## Files in `stage2_front_cam_step20000/`
+- `checkpoints/epoch=0-step=20000.ckpt`
+- `training_config_snapshot.yaml` (exact Hydra snapshot used for this run)
+- `dataset_source.txt` (maps in-run mounted dataset paths to source paths)
+- `checkpoint_metadata.json`
+- `SHA256SUMS`
+- `README.md`
+## Previous artifact
+- `stage1_front_cam_step64000` ([Run `7gximny3`](https://wandb.ai/pravsels/interactive_world_sim/runs/7gximny3))
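+Use `SHA256SUMS` to verify artifact integrity after download, e.g. by running `sha256sum -c SHA256SUMS` from inside the artifact folder (the listed paths are relative to that folder).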

stage2_front_cam_step20000/README.md ADDED Viewed

	@@ -0,0 +1,37 @@

+# Stage 2 Front-Cam Checkpoint (Isambard)
+This folder packages the stage-2 (latent dynamics) checkpoint saved at step 20000 of the first high-throughput run segment.
+## Checkpoint
+- file: `checkpoints/epoch=0-step=20000.ckpt`
+- size_bytes: `232088523`
+- sha256: `9f972dc4a805248d47c03f48a3cc1e4dbb3c85783e6ddbb58d2cdbfc5d4045e2`
+## Training Context
+- project: `interactive_world_sim`
+- cluster: `Isambard (GH200, arm64)`
+- dataset: `WAN H5` (`camera_1_color` front cam)
+- training_stage: `2` (latent dynamics)
+- training precision: `16-mixed`
+- training batch size: `32`
+- train/val dataloader workers: `8/8`
+- wandb mode during training: `offline` (then synced)
+## Provenance
+- source run block: `job 2875089`
+- synced W&B run:
+  [Run `7skk0qh6`](https://wandb.ai/pravsels/interactive_world_sim/runs/7skk0qh6)
+- key loss trend:
+  `training/loss: 0.014964337 -> 3.7573023e-05` (min `2.3631907e-05`, logged over global_step `99 -> 22999`; the packaged checkpoint itself is from step `20000`)
+- exact run config snapshot:
+  `training_config_snapshot.yaml`
+- dataset source mapping:
+  `dataset_source.txt`
+## Notes
+- The run ended before the configured `max_steps` (`1000005`) after `NaN in gradient of module.out.1.weight` was emitted in the Slurm logs.
+- This checkpoint is the recommended stage-2 baseline for downstream evaluation and planning.

stage2_front_cam_step20000/SHA256SUMS ADDED Viewed

	@@ -0,0 +1,3 @@

+9f972dc4a805248d47c03f48a3cc1e4dbb3c85783e6ddbb58d2cdbfc5d4045e2  checkpoints/epoch=0-step=20000.ckpt
+99d3ff7fcd5abb5740beefb604edfd9344389ad854d1d0172ca75bb3b0a87f3c  training_config_snapshot.yaml
+a9f0d2b0440888863b90678211088d2d632f902ea404b49272337e4e337a33c1  dataset_source.txt

stage2_front_cam_step20000/checkpoint_metadata.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "project": "interactive_world_sim",
+  "artifact_name": "stage2_front_cam_step20000",
+  "file_name": "checkpoints/epoch=0-step=20000.ckpt",
+  "size_bytes": 232088523,
+  "sha256": "9f972dc4a805248d47c03f48a3cc1e4dbb3c85783e6ddbb58d2cdbfc5d4045e2",
+  "training_stage": 2,
+  "obs_keys": [
+    "camera_1_color"
+  ],
+  "training_config_snapshot": {
+    "file": "training_config_snapshot.yaml",
+    "size_bytes": 5118,
+    "sha256": "99d3ff7fcd5abb5740beefb604edfd9344389ad854d1d0172ca75bb3b0a87f3c"
+  },
+  "dataset_source_mapping_file": "dataset_source.txt",
+  "source_job_id": "2875089",
+  "wandb_run_url": "https://wandb.ai/pravsels/interactive_world_sim/runs/7skk0qh6",
+  "final_training_loss": 3.7573023e-05,
+  "min_training_loss": 2.3631907e-05,
+  "global_step_first_last": [
+    99,
+    22999
+  ],
+  "run_outcome_note": "Ended early after NaN gradient event in Slurm logs."
+}

stage2_front_cam_step20000/checkpoints/epoch=0-step=20000.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f972dc4a805248d47c03f48a3cc1e4dbb3c85783e6ddbb58d2cdbfc5d4045e2
+size 232088523

stage2_front_cam_step20000/dataset_source.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ /mnt/wan_dataset.h5 -> /scratch/u6cr/pravsels.u6cr/latent_safety/arx5_datasets_6Feb_26_wan224.h5
2	+ /mnt/wan_dataset_stats.json -> /scratch/u6cr/pravsels.u6cr/latent_safety/arx5_datasets_6Feb_26_stats.json

stage2_front_cam_step20000/training_config_snapshot.yaml ADDED Viewed

	@@ -0,0 +1,207 @@

+experiment:
+  debug: ${debug}
+  tasks:
+  - training
+  num_nodes: 1
+  num_devices: 1
+  training:
+    precision: 16-mixed
+    compile: false
+    lr: 8.0e-05
+    batch_size: 32
+    max_epochs: -1
+    max_steps: 1000005
+    max_time: null
+    data:
+      num_workers: 8
+      shuffle: true
+    optim:
+      accumulate_grad_batches: 1
+      gradient_clip_val: 1.0
+    checkpointing:
+      every_n_train_steps: 10000
+      every_n_epochs: null
+      train_time_interval: null
+      enable_version_counter: false
+    log_every_n_steps: 100
+  validation:
+    precision: 16-mixed
+    compile: false
+    batch_size: 2
+    val_every_n_step: 30000
+    val_every_n_epoch: null
+    limit_batch: 1.0
+    inference_mode: true
+    data:
+      num_workers: 8
+      shuffle: false
+  test:
+    precision: 16-mixed
+    compile: false
+    batch_size: 8
+    limit_batch: 1
+    data:
+      num_workers: 16
+      shuffle: false
+  logging:
+    metrics:
+    - fvd
+dataset:
+  debug: ${debug}
+  h5_path: /mnt/wan_dataset.h5
+  dataset_dir: .
+  horizon: 10
+  val_horizon: 200
+  aug_mode: none
+  skip_frame: 1
+  pad_after: 7
+  pad_before: 1
+  seed: 42
+  val_ratio: 0.1
+  skip_idx: 1
+  resolution: 128
+  goal_sample: intermediate
+  stats_json_path: /mnt/wan_dataset_stats.json
+  action_key: actions_delta
+  state_key: states
+  camera_key_map:
+    camera_0_color: camera_0
+    camera_1_color: camera_1
+  lowdim_key_map:
+    joint_pos: states
+  obs_keys:
+  - camera_1_color
+  low_dim_keys: []
+  shape_meta:
+    action:
+      shape:
+      - 7
+    obs:
+      camera_0_color:
+        shape:
+        - 3
+        - 128
+        - 128
+        type: rgb
+      camera_1_color:
+        shape:
+        - 3
+        - 128
+        - 128
+        type: rgb
+algorithm:
+  debug: ${debug}
+  lr: ${experiment.training.lr}
+  weight_decay: 0.0001
+  warmup_steps: 10000
+  lr_scheduler: linear
+  optimizer_beta:
+  - 0.9
+  - 0.999
+  latent_dim: 512
+  action_dim: 7
+  enc_dim: 64
+  num_components: 1
+  obs_keys: ${dataset.obs_keys}
+  x_shape:
+  - ${eval:'3 * len(${dataset.obs_keys})'}
+  - ${dataset.resolution}
+  - ${dataset.resolution}
+  norm_scale: 6.0
+  num_latent_downsample: 2
+  num_views: ${eval:'len(${dataset.obs_keys})'}
+  num_latent_channel: ${eval:'4 * ${algorithm.num_views}'}
+  latent_resolution: ${eval:'${dataset.resolution} // int(2 ** ${algorithm.num_latent_downsample})'}
+  training_stage: 2
+  load_ae: /workspace/outputs/2026-03-13/10-25-30/checkpoints/epoch=0-step=64000.ckpt
+  dtype: ${torch:float}
+  mask_prev_action: false
+  device: cuda
+  noise_level: log_normal
+  val_render: false
+  scheduling_matrix: autoregressive
+  uncertainty_scale: 1.0
+  guidance_scale: 1.0
+  n_frames: ${dataset.horizon}
+  dyn_infer_steps: 1
+  dec_infer_steps: 3
+  last_frame_loss_only: false
+  prev_frame_noise_scale: 0.1
+  robust_latent: false
+  delta: ${eval:'0.00054 * ${algorithm.num_latent_channel} * ${algorithm.latent_resolution}
+    * ${algorithm.latent_resolution}'}
+  sampling_strategy: terminal_only
+  sampling_strategy_params: []
+  dynamics:
+    _target_: interactive_world_sim.algorithms.latent_dynamics.models.cm_latent_dynamics.CMLatentDynamics
+    action_dim: ${algorithm.action_dim}
+    latent_dim: ${algorithm.num_latent_channel}
+    dim: 64
+    action_emb_dim: 512
+    resnet_block_groups: 8
+    dim_mults:
+    - 1
+    - 2
+    attn_dim_head: 128
+    attn_heads: 4
+    use_linear_attn: true
+    use_init_temporal_attn: true
+    init_kernel_size: 5
+    is_causal: true
+    time_emb_type: rotary
+    dtype: ${algorithm.dtype}
+  noise_scheduler:
+    _target_: interactive_world_sim.utils.cm_utils.DDPMScheduler
+    x_shape: ${algorithm.x_shape}
+    timesteps: ${algorithm.diffusion.timesteps}
+    sampling_timesteps: ${algorithm.diffusion.sampling_timesteps}
+    beta_schedule: ${algorithm.diffusion.beta_schedule}
+    schedule_fn_kwargs: ${algorithm.diffusion.schedule_fn_kwargs}
+    objective: ${algorithm.diffusion.objective}
+    loss_weighting: uniform
+    snr_clip: ${algorithm.diffusion.snr_clip}
+    cum_snr_decay: ${algorithm.diffusion.cum_snr_decay}
+    ddim_sampling_eta: ${algorithm.diffusion.ddim_sampling_eta}
+    clip_noise: ${algorithm.diffusion.clip_noise}
+    stabilization_level: ${algorithm.diffusion.stabilization_level}
+    dtype: ${algorithm.dtype}
+  diffusion:
+    beta_schedule: sigmoid
+    objective: pred_v
+    use_fused_snr: true
+    cum_snr_decay: 0.96
+    clip_noise: 6.0
+    schedule_fn_kwargs: {}
+    timesteps: 1000
+    sampling_timesteps: 50
+    ddim_sampling_eta: 0.0
+    snr_clip: 5.0
+    model_channels: ${algorithm.enc_dim}
+    num_latent_downsample: ${algorithm.num_latent_downsample}
+    num_latent_channel: ${algorithm.num_latent_channel}
+    num_res_blocks: 2
+    attention_resolutions:
+    - 2
+    - 4
+    - 8
+    dropout: 0.1
+    channel_mult:
+    - 1
+    - 2
+    - 3
+    num_head_channels: 64
+    resblock_updown: true
+    use_scale_shift_norm: true
+    num_components: ${algorithm.num_components}
+    image_size: ${dataset.resolution}
+    stabilization_level: 15
+  metrics:
+  - fvd
+debug: false
+wandb:
+  entity: pravsels
+  project: interactive_world_sim
+  mode: offline
+resume: null
+load: null
+name: stage2_front_cam