diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..b09813554542b75455cc2a001025f4753394bb63 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.375, + "bag_proxy": 0.4583333333333333, + "cloth_proxy": 0.5833333333333334 + }, + "mean_success": 0.47222222222222215, + "visibility_integral": 37.36026926173104, + "corridor_availability": 0.8730104863643646, + "reocclusion_rate": 0.04405864197530864, + "persistence_horizon_mae": 1.033145775666108, + "disturbance_cost": 0.3228136783000082 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..9e6a9ef36c8d466deeea73d78155369b26115e71 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/checkpoint_best.pt +- mean_success: 0.472 +- visibility_integral: 37.360 +- corridor_availability: 0.873 +- reocclusion_rate: 0.044 +- persistence_horizon_mae: 1.033 +- disturbance_cost: 0.323 +- foliage_proxy_success: 0.375 +- bag_proxy_success: 0.458 +- cloth_proxy_success: 0.583 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eff928d59cc5d589c638a4a1ee7b58917509734b --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage1_clip_seed7 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 7 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage1_seed7.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage1_seed7.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 7 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.2 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..02afb7ed5c23f00d1758269377baba5349de1002 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.25396825396825395, + "planner_regret": 0.024764427915215492, + "planner_score_utility_spearman": 0.1904761791229248, + "risk_calibration_mse": 0.010364258661866188, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.022177213802933693, + "left_right_equivariance_error": 0.0002942846322184778, + "belief_calibration_brier": 0.003581121563911438, + "reocclusion_calibration_brier": 0.23373088240623474, + "support_stability_mae": 0.022998232394456863, + "clearance_auc": 0.8989269585276155, + "memory_write_rate": 0.0, + "memory_saturation": 0.41934600472450256, + "num_samples": 126 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..d16d4bc573f9087e2da1899ccba3528521fdbb9f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.025519870977400175, + "arm_role": 0.03451829462151253, + "belief": 0.11532339149432656, + "clearance": 0.09198410963122758, + "corridor": 0.27232400180664673, + "disturbance": 0.005858588227789626, + "grasp_affordance": 0.018751464233153464, + "occluder_contact": 0.21359099159065967, + "persistence": 5.231568055785678, + "phase": 0.7372311896678665, + "planner_ranking": 0.1646315749647481, + "planner_risk": 0.014348083711473067, + "planner_success": 0.6091769787029446, + "proposal_diversity": 0.0, + "proposal_ranking": 1.253575401780493, + "proposal_reconstruction": 0.067724266230904, + "proposal_success": 0.6851897648491785, + "reocclusion": 0.7031442959895309, + "role_swap_consistency": 0.00044027801038677857, + "support_mode": 0.7282283443430956, + "support_stability": 0.15459337279551627, + "total": 1.6319934494832424, + "uncertainty": 0.013496716971069097, + "visibility": 0.11563199924314833, + "world_model": 2.671503098223222 + }, + "val": { + "action": 0.020692157455616526, + "arm_role": 9.546122843554865e-05, + "belief": 0.09874132736807778, + "clearance": 0.08244451738539196, + "corridor": 0.2306106292775699, + "disturbance": 0.006118982125097694, + "grasp_affordance": 0.009981726739732992, + "occluder_contact": 0.19720953915800368, + "persistence": 3.8672617465730696, + "phase": 0.668701058815396, + "planner_ranking": 0.03794538755975072, + "planner_risk": 0.009814016923349026, + "planner_success": 0.5628143776030767, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1249213124078417, + "proposal_reconstruction": 0.06329423224642164, + "proposal_success": 0.6747160203873165, + "reocclusion": 0.692203164100647, + "role_swap_consistency": 0.0, + "support_mode": 0.6680677216204386, + "support_stability": 0.1511912994411966, + "total": 1.358805573175824, + "uncertainty": 0.003482046378185115, + "visibility": 0.10417925601913816, + "world_model": 2.1376701915074907 + } + }, + { + "epoch": 1, + "train": { + "action": 0.02150821143575988, + "arm_role": 1.9482293054071397e-05, + "belief": 0.09863162136280725, + "clearance": 0.08064276829400924, + "corridor": 0.24359133383210416, + "disturbance": 0.002735878452234476, + "grasp_affordance": 0.009349104797184779, + "occluder_contact": 0.1937003313558888, + "persistence": 4.076787073262699, + "phase": 0.6966290698625655, + "planner_ranking": 0.04271617977273956, + "planner_risk": 0.010049402082938681, + "planner_success": 0.5399472568359674, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1569982820156357, + "proposal_reconstruction": 0.06389496966962414, + "proposal_success": 0.6711133328407847, + "reocclusion": 0.6940537130957498, + "role_swap_consistency": 0.00022550253765151655, + "support_mode": 0.6837139029777487, + "support_stability": 0.14029162690160474, + "total": 1.3837347957476271, + "uncertainty": 0.0016494125736687157, + "visibility": 0.09400421737922424, + "world_model": 2.175609592991974 + }, + "val": { + "action": 0.020051477757829523, + "arm_role": 2.626385377793451e-06, + "belief": 0.09183884199176516, + "clearance": 0.07657587877105153, + "corridor": 0.22728621321065084, + "disturbance": 0.0016498260886850951, + "grasp_affordance": 0.009590831518705403, + "occluder_contact": 0.1917984854607355, + "persistence": 3.699212070495363, + "phase": 0.6689459842348856, + "planner_ranking": 0.03331218510795715, + "planner_risk": 0.010092773325076061, + "planner_success": 0.5014436940352122, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1606994933552213, + "proposal_reconstruction": 0.062439400820978104, + "proposal_success": 0.675733851061927, + "reocclusion": 0.6921006942552234, + "role_swap_consistency": 0.0, + "support_mode": 0.6564426545112853, + "support_stability": 0.14099458102432508, + "total": 1.313369631767273, + "uncertainty": 0.0024020517326240973, + "visibility": 0.08723713226971172, + "world_model": 2.0216772158940635 + } + }, + { + "epoch": 2, + "train": { + "action": 0.018980447901412845, + "arm_role": 2.3090714559505124e-05, + "belief": 0.1100015923263827, + "clearance": 0.0791148773262872, + "corridor": 0.23030528037001852, + "disturbance": 0.002447301701405857, + "grasp_affordance": 0.009001106255400087, + "occluder_contact": 0.21010415864552504, + "persistence": 2.0494745795430753, + "phase": 0.459073231482381, + "planner_ranking": 0.036845811475892686, + "planner_risk": 0.011261017404920885, + "planner_success": 0.5133467099741491, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1499755538570944, + "proposal_reconstruction": 0.062038555780318395, + "proposal_success": 0.6672172468370168, + "reocclusion": 0.41151915600825667, + "role_swap_consistency": 0.0007739521978125561, + "support_mode": 0.38595684411013936, + "support_stability": 0.1425538511912665, + "total": 1.1811942648513154, + "uncertainty": 0.000767841034371724, + "visibility": 0.10209987125315591, + "world_model": 2.070929214904446 + }, + "val": { + "action": 0.0138629823627453, + "arm_role": 0.002011558223822855, + "belief": 0.10340341582657799, + "clearance": 0.0855481999497565, + "corridor": 0.2235906974427284, + "disturbance": 0.0011637268657111797, + "grasp_affordance": 0.010592727485807642, + "occluder_contact": 0.20843842601965343, + "persistence": 1.1762515253254346, + "phase": 0.3442955078771486, + "planner_ranking": 0.03461442932137519, + "planner_risk": 0.01165175854065825, + "planner_success": 0.45808544967855724, + "proposal_diversity": 0.0, + "proposal_ranking": 1.3026971003365895, + "proposal_reconstruction": 0.05888378312663427, + "proposal_success": 0.7430036550476438, + "reocclusion": 0.2871374910076459, + "role_swap_consistency": 0.0, + "support_mode": 0.22473623181900215, + "support_stability": 0.1320991822414928, + "total": 1.1099917330439129, + "uncertainty": 0.0005805234163528352, + "visibility": 0.09557991185122067, + "world_model": 1.9994045325687952 + } + }, + { + "epoch": 3, + "train": { + "action": 0.014569098466314883, + "arm_role": 4.4951576212937916e-05, + "belief": 0.09620984569582015, + "clearance": 0.07538617284315106, + "corridor": 0.21248489566188775, + "disturbance": 0.0016758848629270635, + "grasp_affordance": 0.008272631588777167, + "occluder_contact": 0.19746327033529731, + "persistence": 1.1089699098374644, + "phase": 0.3716845961765469, + "planner_ranking": 0.03254403228879829, + "planner_risk": 0.010248634800575772, + "planner_success": 0.47941413580279074, + "proposal_diversity": 0.0, + "proposal_ranking": 1.153262345578658, + "proposal_reconstruction": 0.05860933205064055, + "proposal_success": 0.6466394141706496, + "reocclusion": 0.2566672772173989, + "role_swap_consistency": 0.0010398222479868085, + "support_mode": 0.21815690070546734, + "support_stability": 0.13650912478449145, + "total": 1.0633102330861914, + "uncertainty": 0.0002461711761398012, + "visibility": 0.09588275449984361, + "world_model": 1.9903733518111144 + }, + "val": { + "action": 0.01619998768474611, + "arm_role": 3.844006559777174e-06, + "belief": 0.09427393618084136, + "clearance": 0.07296533326780985, + "corridor": 0.2100035525148823, + "disturbance": 0.0013519242122204862, + "grasp_affordance": 0.007646961093303703, + "occluder_contact": 0.1950870676646157, + "persistence": 1.3894045449024628, + "phase": 0.6804814789192899, + "planner_ranking": 0.027768202883649677, + "planner_risk": 0.010219628483081044, + "planner_success": 0.4819766197885786, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1241777983922807, + "proposal_reconstruction": 0.060782825840370994, + "proposal_success": 0.6369421221907177, + "reocclusion": 0.27461627113913734, + "role_swap_consistency": 0.0, + "support_mode": 0.08716485598531093, + "support_stability": 0.13245442648610425, + "total": 1.0629130696493483, + "uncertainty": 8.45672577761145e-05, + "visibility": 0.1013997554306, + "world_model": 1.8573077273747278 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..794d761bcb9fe58941b4f435665e75eb6f536b98 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_clip_seed7", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed7/checkpoint_best.pt", + "final_train_total": 1.0633102330861914, + "final_val_total": 1.0629130696493483, + "train_time_sec": 174.85308933258057, + "peak_gpu_memory_mb": 1919.8251953125, + "num_train_samples": 382, + "num_val_samples": 126, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..78f463cae8a8270ef40d9dd9e7696812b95f2b69 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5555555555555555, + "visibility_integral": 31.92372977733612, + "corridor_availability": 0.8500884034567409, + "reocclusion_rate": 0.029287114566719827, + "persistence_horizon_mae": 0.894922278028389, + "disturbance_cost": 0.28616168903600836 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..707fc6e70e75c11e0e87ac7960db0a33969301db --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/checkpoint_best.pt +- mean_success: 0.556 +- visibility_integral: 31.924 +- corridor_availability: 0.850 +- reocclusion_rate: 0.029 +- persistence_horizon_mae: 0.895 +- disturbance_cost: 0.286 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb7af2bd2871f10553592409954b935d914cf98d --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage1_clip_seed8 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 8 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage1_seed8.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage1_seed8.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 8 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.2 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..15a6d8212f84fcd886cfbf4336788174e2b49d33 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.25984251968503935, + "planner_regret": 0.024652592837810516, + "planner_score_utility_spearman": 0.15748029947280884, + "risk_calibration_mse": 0.010109159164130688, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.02039325051009655, + "left_right_equivariance_error": 8.317838273796951e-05, + "belief_calibration_brier": 0.0039802417159080505, + "reocclusion_calibration_brier": 0.2667863667011261, + "support_stability_mae": 0.023258011788129807, + "clearance_auc": 0.9407927438472715, + "memory_write_rate": 0.0, + "memory_saturation": 0.5879086852073669, + "num_samples": 127 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..7307319477932bd083bfcbc36d02d39929231535 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.025799189747862168, + "arm_role": 0.027215735138398815, + "belief": 0.11522909954034222, + "clearance": 0.09597517975181809, + "corridor": 0.3045216482132673, + "disturbance": 0.006567074132739083, + "grasp_affordance": 0.02625927054055074, + "occluder_contact": 0.2161167692295544, + "persistence": 7.305491891831004, + "phase": 0.7473598300474477, + "planner_ranking": 0.14102927445574143, + "planner_risk": 0.014660530898254365, + "planner_success": 0.596433128830026, + "proposal_diversity": 0.0, + "proposal_ranking": 1.26868818193206, + "proposal_reconstruction": 0.06815405646387819, + "proposal_success": 0.6748700912710259, + "reocclusion": 0.7006335564308765, + "role_swap_consistency": 0.0005011227108655176, + "support_mode": 0.7077700629908377, + "support_stability": 0.1599257462645798, + "total": 1.733834327203441, + "uncertainty": 0.022427979406115357, + "visibility": 0.11316451830155562, + "world_model": 2.674901399312843 + }, + "val": { + "action": 0.02199536032276228, + "arm_role": 9.8040056428772e-06, + "belief": 0.0978035525768064, + "clearance": 0.07755720446584746, + "corridor": 0.24431297194678336, + "disturbance": 0.0019795258613157785, + "grasp_affordance": 0.008650467454572208, + "occluder_contact": 0.20205649081617594, + "persistence": 4.437129996716976, + "phase": 0.6695621414110065, + "planner_ranking": 0.04436381870164041, + "planner_risk": 0.010196975797498453, + "planner_success": 0.5646271030418575, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1638631131500006, + "proposal_reconstruction": 0.06484090705635026, + "proposal_success": 0.6649224627763033, + "reocclusion": 0.7438069470226765, + "role_swap_consistency": 0.0, + "support_mode": 0.673728191293776, + "support_stability": 0.13629821891663596, + "total": 1.4150245506316423, + "uncertainty": 0.002036258225416532, + "visibility": 0.09110353700816631, + "world_model": 2.210838695988059 + } + }, + { + "epoch": 1, + "train": { + "action": 0.02220674532499769, + "arm_role": 4.0168849585568094e-05, + "belief": 0.10375202887969491, + "clearance": 0.08468958432176663, + "corridor": 0.24882320250282114, + "disturbance": 0.002981857188692701, + "grasp_affordance": 0.00994103324857994, + "occluder_contact": 0.20824503820604054, + "persistence": 4.263324179262391, + "phase": 0.7222360341336714, + "planner_ranking": 0.044953017053952174, + "planner_risk": 0.010661984013600143, + "planner_success": 0.5370719069273684, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1506784087076236, + "proposal_reconstruction": 0.06470025059674422, + "proposal_success": 0.6748968515720667, + "reocclusion": 0.7042920837539652, + "role_swap_consistency": 0.00024932249915769023, + "support_mode": 0.6881518938154451, + "support_stability": 0.1487102357972979, + "total": 1.3995415040959862, + "uncertainty": 0.0019858729011069556, + "visibility": 0.09729615078156531, + "world_model": 2.178037493952906 + }, + "val": { + "action": 0.029678026388864964, + "arm_role": 0.0003116108114227245, + "belief": 0.10797233448829502, + "clearance": 0.08150003047194332, + "corridor": 0.2509052273235284, + "disturbance": 0.002103368451003007, + "grasp_affordance": 0.008963905274868011, + "occluder_contact": 0.2007133779115975, + "persistence": 4.478599248453975, + "phase": 0.7040554136037827, + "planner_ranking": 0.03813048706929578, + "planner_risk": 0.01057393318569666, + "planner_success": 0.5217722351662815, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1685641314834356, + "proposal_reconstruction": 0.07131227233912796, + "proposal_success": 0.6757729910314083, + "reocclusion": 0.6976062525063753, + "role_swap_consistency": 0.0, + "support_mode": 0.7273222031071782, + "support_stability": 0.1463006478443276, + "total": 1.3876731358468533, + "uncertainty": 0.0005028243003835087, + "visibility": 0.10090084094554186, + "world_model": 2.023001086898148 + } + }, + { + "epoch": 2, + "train": { + "action": 0.022834130358048446, + "arm_role": 3.6339485208401505e-05, + "belief": 0.10015391417978946, + "clearance": 0.08339313631243418, + "corridor": 0.24550532728082536, + "disturbance": 0.002419849791671015, + "grasp_affordance": 0.011102509094860541, + "occluder_contact": 0.20242435567041966, + "persistence": 4.354869382134127, + "phase": 0.6933721572316754, + "planner_ranking": 0.04187904763565859, + "planner_risk": 0.010259467963658331, + "planner_success": 0.5138571092283538, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1488539314394846, + "proposal_reconstruction": 0.06509613401758733, + "proposal_success": 0.6776590312962757, + "reocclusion": 0.70495132540221, + "role_swap_consistency": 0.0003516697920602868, + "support_mode": 0.6823001881544503, + "support_stability": 0.14350243961116718, + "total": 1.378995967473035, + "uncertainty": 0.0031733291824921203, + "visibility": 0.09716511293465555, + "world_model": 2.104598100584839 + }, + "val": { + "action": 0.02644303720444441, + "arm_role": 4.627731826190029e-06, + "belief": 0.10258024383801967, + "clearance": 0.07597982959123328, + "corridor": 0.2423992605181411, + "disturbance": 0.0015974244740846189, + "grasp_affordance": 0.007909159859991632, + "occluder_contact": 0.19435308501124382, + "persistence": 3.919285401701927, + "phase": 0.6770087121985853, + "planner_ranking": 0.030531517459166935, + "planner_risk": 0.010262692154356046, + "planner_success": 0.5169326290488243, + "proposal_diversity": 0.0, + "proposal_ranking": 1.138186807744205, + "proposal_reconstruction": 0.06911751109873876, + "proposal_success": 0.6695848302915692, + "reocclusion": 0.6975388880819082, + "role_swap_consistency": 0.0, + "support_mode": 0.6884247697889805, + "support_stability": 0.13594868587097153, + "total": 1.3366163168102503, + "uncertainty": 0.0006479808544099797, + "visibility": 0.09649082575924695, + "world_model": 2.0216304706409574 + } + }, + { + "epoch": 3, + "train": { + "action": 0.021160060905544235, + "arm_role": 5.587545364939105e-05, + "belief": 0.10077974488909956, + "clearance": 0.08377115065670762, + "corridor": 0.2723994788211522, + "disturbance": 0.0028603613238174243, + "grasp_affordance": 0.011514163958835196, + "occluder_contact": 0.20602131318983607, + "persistence": 3.0813600014851317, + "phase": 0.6817607779777487, + "planner_ranking": 0.031658034657560674, + "planner_risk": 0.010394540625284256, + "planner_success": 0.5069346120532271, + "proposal_diversity": 0.0, + "proposal_ranking": 1.132226309851202, + "proposal_reconstruction": 0.06328810811900967, + "proposal_success": 0.6744790461050902, + "reocclusion": 0.6852282721022661, + "role_swap_consistency": 0.0005754872515272832, + "support_mode": 0.6633978239528796, + "support_stability": 0.14488365837977468, + "total": 1.293662095569191, + "uncertainty": 0.0023333917296635863, + "visibility": 0.09853576490392235, + "world_model": 2.0413369105748482 + }, + "val": { + "action": 0.017367416352499276, + "arm_role": 7.692722565622034e-07, + "belief": 0.1027774921967648, + "clearance": 0.08752925635781139, + "corridor": 0.26156787533545867, + "disturbance": 0.0016430629628985116, + "grasp_affordance": 0.010058694657345768, + "occluder_contact": 0.21157401148229837, + "persistence": 1.0993698399979621, + "phase": 0.6142133427783847, + "planner_ranking": 0.03328441088268619, + "planner_risk": 0.010188427979301196, + "planner_success": 0.4918641885742545, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1239634547382593, + "proposal_reconstruction": 0.06056849448941648, + "proposal_success": 0.6778606250882149, + "reocclusion": 0.5640022717416286, + "role_swap_consistency": 0.0, + "support_mode": 0.5024671151768416, + "support_stability": 0.13648800805094652, + "total": 1.1350205279886723, + "uncertainty": 0.0008341338888158134, + "visibility": 0.0982570193009451, + "world_model": 1.93993010930717 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..692d7e32568f5777dfc09b58a949219b0197ed09 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_clip_seed8", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed8/checkpoint_best.pt", + "final_train_total": 1.293662095569191, + "final_val_total": 1.1350205279886723, + "train_time_sec": 146.87081933021545, + "peak_gpu_memory_mb": 1891.1337890625, + "num_train_samples": 381, + "num_val_samples": 127, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f4afaf8b7e90803ec0958bb386e00ff8fb571a --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444443, + "visibility_integral": 32.623872251146366, + "corridor_availability": 0.889709601799647, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.1627785900766536, + "disturbance_cost": 0.2332938505957524 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..f18ad9ff0d4589962fc44ae064bebaa8a51fd460 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 32.624 +- corridor_availability: 0.890 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.163 +- disturbance_cost: 0.233 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..426f4d7345307a7df8fe8fff3536b2bef0b1a763 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage1_clip_seed9 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 9 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage1_seed9.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage1_seed9.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 9 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.2 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..2647e97370154cf256897d5dd2051143d5f74d5c --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.2890625, + "planner_regret": 0.02300698682665825, + "planner_score_utility_spearman": 0.22968751192092896, + "risk_calibration_mse": 0.010304542258381844, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.022611485794186592, + "left_right_equivariance_error": 8.689248215887346e-05, + "belief_calibration_brier": 0.0043337177485227585, + "reocclusion_calibration_brier": 0.22800305485725403, + "support_stability_mae": 0.02859283983707428, + "clearance_auc": 0.6329041426155311, + "memory_write_rate": 0.0, + "memory_saturation": 0.2469944953918457, + "num_samples": 128 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..599d894dece12750e4f4bbe429d7b3385b8d9cbf --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.027812569460978633, + "arm_role": 0.030137697646492406, + "belief": 0.12157152328444154, + "clearance": 0.09282162053216444, + "corridor": 0.2851656379864404, + "disturbance": 0.004553798715077344, + "grasp_affordance": 0.018851539715634365, + "occluder_contact": 0.2132460696916831, + "persistence": 5.642576662878807, + "phase": 0.7761939600894325, + "planner_ranking": 0.17902961440620282, + "planner_risk": 0.013923984336035668, + "planner_success": 0.6199151214800382, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2823116054660395, + "proposal_reconstruction": 0.06912861580127164, + "proposal_success": 0.6811760576147782, + "reocclusion": 0.7353295496419856, + "role_swap_consistency": 0.0005873552748725113, + "support_mode": 0.7828435195119757, + "support_stability": 0.16347284512594343, + "total": 1.6866143584251403, + "uncertainty": 0.019001170223897423, + "visibility": 0.11754893544865282, + "world_model": 2.710779071795313 + }, + "val": { + "action": 0.02170204828144051, + "arm_role": 6.762321064002208e-06, + "belief": 0.10080993873998523, + "clearance": 0.08166962582617998, + "corridor": 0.23909102065954357, + "disturbance": 0.001983066906802833, + "grasp_affordance": 0.008535019573173486, + "occluder_contact": 0.2112727805506438, + "persistence": 3.857563339173794, + "phase": 0.6654304726980627, + "planner_ranking": 0.04032187890697969, + "planner_risk": 0.011350331830726645, + "planner_success": 0.5934910103678703, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1493350621312857, + "proposal_reconstruction": 0.06338102876907215, + "proposal_success": 0.6806164355948567, + "reocclusion": 0.6909330077469349, + "role_swap_consistency": 0.0, + "support_mode": 0.6831411011517048, + "support_stability": 0.13910080850473605, + "total": 1.458911145105958, + "uncertainty": 0.0033405375688744243, + "visibility": 0.09547075629234314, + "world_model": 2.5560860373079777 + } + }, + { + "epoch": 1, + "train": { + "action": 0.023493385471795733, + "arm_role": 0.0002928718140250758, + "belief": 0.10523007610126546, + "clearance": 0.08677955961933262, + "corridor": 0.25750191186211613, + "disturbance": 0.0031594517295421777, + "grasp_affordance": 0.01005841078187682, + "occluder_contact": 0.20920588836858148, + "persistence": 4.331643560058192, + "phase": 0.7189607319078948, + "planner_ranking": 0.05423959079287933, + "planner_risk": 0.010427037446980217, + "planner_success": 0.5849820621703801, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1505002517449228, + "proposal_reconstruction": 0.06525950771021216, + "proposal_success": 0.6752778025049913, + "reocclusion": 0.7005268357302014, + "role_swap_consistency": 0.0007142310405278726, + "support_mode": 0.70107421875, + "support_stability": 0.14081861141480898, + "total": 1.432289683818817, + "uncertainty": 0.002551493341237993, + "visibility": 0.10134971671198544, + "world_model": 2.237849539204648 + }, + "val": { + "action": 0.021186921891057864, + "arm_role": 3.6694105953749556e-07, + "belief": 0.09995241661090404, + "clearance": 0.08146111795213073, + "corridor": 0.24082361184991896, + "disturbance": 0.001976304362585779, + "grasp_affordance": 0.00922958003502572, + "occluder_contact": 0.21127386414445937, + "persistence": 3.7571401111781597, + "phase": 0.6817005267366767, + "planner_ranking": 0.03515352255374182, + "planner_risk": 0.01038273600534012, + "planner_success": 0.5073812543414533, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1285581476986408, + "proposal_reconstruction": 0.0629420520272106, + "proposal_success": 0.6745674163103104, + "reocclusion": 0.6919681001454592, + "role_swap_consistency": 0.0, + "support_mode": 0.6647901809774339, + "support_stability": 0.14570825529517606, + "total": 1.3415670674294233, + "uncertainty": 0.0013466343752952525, + "visibility": 0.09475092665525153, + "world_model": 2.1340785464271903 + } + }, + { + "epoch": 2, + "train": { + "action": 0.021538028542540576, + "arm_role": 2.1901883577045642e-05, + "belief": 0.10526431232298675, + "clearance": 0.08594944182979433, + "corridor": 0.24735975777240177, + "disturbance": 0.0026733651749964336, + "grasp_affordance": 0.010091915089440974, + "occluder_contact": 0.20871730721310566, + "persistence": 4.281911664887478, + "phase": 0.6870194284539474, + "planner_ranking": 0.04152601579832519, + "planner_risk": 0.01045033406331449, + "planner_success": 0.5353652712545897, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1453557397189893, + "proposal_reconstruction": 0.06370952629337186, + "proposal_success": 0.6778088651205364, + "reocclusion": 0.6986164701612372, + "role_swap_consistency": 0.0004750598012929243, + "support_mode": 0.6878212376644737, + "support_stability": 0.1362508504700504, + "total": 1.384049719885776, + "uncertainty": 0.001396400365047157, + "visibility": 0.09892214826847377, + "world_model": 2.1307888821551675 + }, + "val": { + "action": 0.021681111145881005, + "arm_role": 0.0003864255304506514, + "belief": 0.10844068287406117, + "clearance": 0.08775011514080688, + "corridor": 0.23830276518128812, + "disturbance": 0.0019835491895037194, + "grasp_affordance": 0.011450761739979498, + "occluder_contact": 0.21598492935299873, + "persistence": 3.682887438684702, + "phase": 0.6754010105505586, + "planner_ranking": 0.03584061572041719, + "planner_risk": 0.010325502114255869, + "planner_success": 0.49944606237113476, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1196386851370335, + "proposal_reconstruction": 0.0637086319620721, + "proposal_success": 0.6784614324569702, + "reocclusion": 0.6908501861616969, + "role_swap_consistency": 0.0, + "support_mode": 0.6635435968637466, + "support_stability": 0.14290154923219234, + "total": 1.3013203730806708, + "uncertainty": 0.002612559406315995, + "visibility": 0.10054636449785903, + "world_model": 1.9632274899631739 + } + }, + { + "epoch": 3, + "train": { + "action": 0.02116909674123714, + "arm_role": 0.00017300687338176526, + "belief": 0.10208533270970771, + "clearance": 0.08287150121637081, + "corridor": 0.24314571875882776, + "disturbance": 0.002553280315360577, + "grasp_affordance": 0.010202447837218642, + "occluder_contact": 0.20370756677891078, + "persistence": 3.4343402633541507, + "phase": 0.6811472039473684, + "planner_ranking": 0.03300265433170257, + "planner_risk": 0.010154466018828221, + "planner_success": 0.5132313249338615, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1288216785380716, + "proposal_reconstruction": 0.06323393973472871, + "proposal_success": 0.6770071575516149, + "reocclusion": 0.7064933630980943, + "role_swap_consistency": 0.0003766025873023625, + "support_mode": 0.7007555509868421, + "support_stability": 0.1340178519732466, + "total": 1.314924956309168, + "uncertainty": 0.0012071453580622467, + "visibility": 0.09558045302370662, + "world_model": 2.054408212398228 + }, + "val": { + "action": 0.021696553943911567, + "arm_role": 6.053594985289124e-07, + "belief": 0.0983218071050942, + "clearance": 0.07689482159912586, + "corridor": 0.29242096332018264, + "disturbance": 0.0041615761442699295, + "grasp_affordance": 0.0100187708158046, + "occluder_contact": 0.19618010916747153, + "persistence": 4.662721422035247, + "phase": 0.6692422716878355, + "planner_ranking": 0.030305169929533804, + "planner_risk": 0.010842124038390466, + "planner_success": 0.5005343491211534, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1591037698090076, + "proposal_reconstruction": 0.06389545585261658, + "proposal_success": 0.6826766086742282, + "reocclusion": 0.7785650952719152, + "role_swap_consistency": 0.0, + "support_mode": 0.6616131067276001, + "support_stability": 0.1388778503460344, + "total": 1.3739404007792473, + "uncertainty": 2.288464340693963e-05, + "visibility": 0.09415236074710265, + "world_model": 1.9970475500449538 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..32d58b6061f625b571b5d8838426b6fce2c685f1 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_clip_seed9", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_clip_seed9/checkpoint_best.pt", + "final_train_total": 1.314924956309168, + "final_val_total": 1.3739404007792473, + "train_time_sec": 146.7574381828308, + "peak_gpu_memory_mb": 1915.8154296875, + "num_train_samples": 380, + "num_val_samples": 128, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..106d477e0c21a4bfde902673314a22e61b797ed1 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4583333333333333, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444445, + "visibility_integral": 32.2005988392565, + "corridor_availability": 0.8664570152759552, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.1903364318709135, + "disturbance_cost": 0.35011103795841336 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..7a2f382b792c5bce2887645704df5a2e5222e4c2 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 32.201 +- corridor_availability: 0.866 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.190 +- disturbance_cost: 0.350 +- foliage_proxy_success: 0.458 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..08850e6f7b5edbd0adc7220b1e222d63ef26ca67 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5555555555555555, + "visibility_integral": 33.31703626612822, + "corridor_availability": 0.886079938047462, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.1836884579143008, + "disturbance_cost": 0.3696938648612963 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..b3e9c782e98fe444da6e45e40b38e06d7b52d108 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_planner/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/checkpoint_best.pt +- mean_success: 0.556 +- visibility_integral: 33.317 +- corridor_availability: 0.886 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.184 +- disturbance_cost: 0.370 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..3c2f8e1902fac34a5642a51fc0e6cf1d1a8e19ba --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4583333333333333, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444445, + "visibility_integral": 32.571378606888985, + "corridor_availability": 0.8744470203916231, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.249059588784357, + "disturbance_cost": 0.34120469799058306 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..65a295fda51ef2d3df990ae4028ce8ee3cad8aee --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/benchmark_no_role_symmetry/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 32.571 +- corridor_availability: 0.874 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.249 +- disturbance_cost: 0.341 +- foliage_proxy_success: 0.458 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..441cbf8ec8fa39123f486d3ba1787de5632aa000 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/config_resolved.yaml @@ -0,0 +1,147 @@ +experiment_name: proxy_interaction_r3d_stage1_dummy_seed13 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 13 +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage1_dummy_seed13.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage1_dummy_seed13.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 13 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..3a14abed5ac92c7c5a742ae1ab28660371821f46 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.2595419847328244, + "planner_regret": 0.015185066498816013, + "planner_score_utility_spearman": 0.25190839171409607, + "risk_calibration_mse": 0.011332111433148384, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.02456846833229065, + "left_right_equivariance_error": 0.007538194466820534, + "belief_calibration_brier": 0.0055354926735162735, + "reocclusion_calibration_brier": 0.2274838089942932, + "support_stability_mae": 0.030257930979132652, + "clearance_auc": 0.7414014153848468, + "memory_write_rate": 0.0, + "memory_saturation": 0.7680174112319946, + "num_samples": 131 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..7e8ffdbfd0f3e9b6dd4e25065fe252303547d909 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.029530804604291916, + "arm_role": 0.19113596672893132, + "belief": 0.19201900158077478, + "clearance": 0.1937584774568677, + "corridor": 0.30155759242673713, + "disturbance": 0.018230090441647917, + "grasp_affordance": 0.1115249302238226, + "occluder_contact": 0.29577948339283466, + "persistence": 5.046393771966298, + "phase": 0.835017109910647, + "planner_ranking": 0.6733469751973947, + "planner_risk": 0.04033496890527507, + "planner_success": 0.6355331862966219, + "proposal_diversity": 0.0, + "proposal_ranking": 1.276770144701004, + "proposal_reconstruction": 0.07184042579804857, + "proposal_success": 0.6676094954212507, + "reocclusion": 0.6988904004295667, + "role_swap_consistency": 0.0006935761872834215, + "support_mode": 0.7387049297491709, + "support_stability": 0.22416748199611902, + "total": 2.4212693075339, + "uncertainty": 0.32931591259936493, + "visibility": 0.23356754829486212, + "world_model": 4.170340110858281 + }, + "val": { + "action": 0.023605089427696332, + "arm_role": 8.891185360779572e-05, + "belief": 0.112466166416804, + "clearance": 0.08774211009343465, + "corridor": 0.2502693798806932, + "disturbance": 0.0037313879001885653, + "grasp_affordance": 0.013532540657454066, + "occluder_contact": 0.2236137886842092, + "persistence": 4.796973652309841, + "phase": 0.6506193346447415, + "planner_ranking": 0.45240074396133423, + "planner_risk": 0.012336155710120996, + "planner_success": 0.6348234679963853, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1647081640031602, + "proposal_reconstruction": 0.06623147221075164, + "proposal_success": 0.6723773082097372, + "reocclusion": 0.6799028648270501, + "role_swap_consistency": 0.0, + "support_mode": 0.6129622724321153, + "support_stability": 0.14574629151158863, + "total": 1.9533665710025363, + "uncertainty": 0.057104989886283875, + "visibility": 0.09962501211298837, + "world_model": 3.08394538031684 + } + }, + { + "epoch": 1, + "train": { + "action": 0.02052135338696341, + "arm_role": 0.00010673219821910607, + "belief": 0.11743779480457306, + "clearance": 0.09043452050536871, + "corridor": 0.24632801488041878, + "disturbance": 0.003475519949764324, + "grasp_affordance": 0.01625332736875862, + "occluder_contact": 0.2240921917061011, + "persistence": 4.695922573407491, + "phase": 0.49508154888947803, + "planner_ranking": 0.14279444872712097, + "planner_risk": 0.0141817982463787, + "planner_success": 0.593176061908404, + "proposal_diversity": 0.0, + "proposal_ranking": 1.165678009390831, + "proposal_reconstruction": 0.06292749894782901, + "proposal_success": 0.674570898214976, + "reocclusion": 0.3844434078782797, + "role_swap_consistency": 0.00039524554207067314, + "support_mode": 0.17358588459561966, + "support_stability": 0.1374168156956633, + "total": 1.6440163105726242, + "uncertainty": 0.047071967429171004, + "visibility": 0.11256152174125116, + "world_model": 2.4736096411943436 + }, + "val": { + "action": 0.020492848422792222, + "arm_role": 0.0002776960156754487, + "belief": 0.1081986419028706, + "clearance": 0.08335375868611866, + "corridor": 0.24787565734651354, + "disturbance": 0.0022675159141524797, + "grasp_affordance": 0.012290253303945065, + "occluder_contact": 0.21959979832172394, + "persistence": 4.647055625915527, + "phase": 0.4316861795054542, + "planner_ranking": 0.06341143821676572, + "planner_risk": 0.015357115098999606, + "planner_success": 0.5689369605647193, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1283477942148845, + "proposal_reconstruction": 0.06308732968237665, + "proposal_success": 0.6809348861376444, + "reocclusion": 0.2748950504594379, + "role_swap_consistency": 0.0, + "support_mode": 0.0006280758987284369, + "support_stability": 0.14622381826241812, + "total": 1.6025353935029771, + "uncertainty": 0.02438033703300688, + "visibility": 0.10466726124286652, + "world_model": 2.558868553903368 + } + }, + { + "epoch": 2, + "train": { + "action": 0.01646478761297961, + "arm_role": 9.377782756322024e-05, + "belief": 0.10991635639220476, + "clearance": 0.0843405183404684, + "corridor": 0.2701566057900588, + "disturbance": 0.0031300995663817353, + "grasp_affordance": 0.012393822447241595, + "occluder_contact": 0.21479063170651594, + "persistence": 2.6339182580510774, + "phase": 0.431367311005791, + "planner_ranking": 0.06486702508603533, + "planner_risk": 0.013548698586722216, + "planner_success": 0.5643768397470316, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1353335281213124, + "proposal_reconstruction": 0.05951391921068231, + "proposal_success": 0.6731756230195364, + "reocclusion": 0.2623978331685066, + "role_swap_consistency": 0.00040521422973445925, + "support_mode": 0.000605581031171217, + "support_stability": 0.1400139912342032, + "total": 1.2923575937747955, + "uncertainty": 0.02004621450517637, + "visibility": 0.10328224146117766, + "world_model": 2.1331751296917596 + }, + "val": { + "action": 0.018090524814195104, + "arm_role": 4.204427063490989e-05, + "belief": 0.11348766502406862, + "clearance": 0.0778748012251324, + "corridor": 0.24816315703921848, + "disturbance": 0.0018734507805978258, + "grasp_affordance": 0.008446878753602505, + "occluder_contact": 0.2068953894906574, + "persistence": 1.9170836640728846, + "phase": 0.4777056227127711, + "planner_ranking": 0.07497243583202362, + "planner_risk": 0.012007931971715556, + "planner_success": 0.5846167008082072, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1227490504582722, + "proposal_reconstruction": 0.06178469873136944, + "proposal_success": 0.6768591006596884, + "reocclusion": 0.2698347626460923, + "role_swap_consistency": 0.0, + "support_mode": 0.0005942495643264718, + "support_stability": 0.14820611890819338, + "total": 1.2714158693949382, + "uncertainty": 0.004030831908393238, + "visibility": 0.09794799155659145, + "world_model": 2.303717931111654 + } + }, + { + "epoch": 3, + "train": { + "action": 0.015296258614398539, + "arm_role": 9.897743439069018e-05, + "belief": 0.10741911331812541, + "clearance": 0.07931565772742033, + "corridor": 0.23081608302891254, + "disturbance": 0.00287542298125724, + "grasp_affordance": 0.008955261165586611, + "occluder_contact": 0.21085621416568756, + "persistence": 1.6830786913633347, + "phase": 0.4407324629525344, + "planner_ranking": 0.053573422211532794, + "planner_risk": 0.011835894741428396, + "planner_success": 0.5389373525977135, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1375357458988826, + "proposal_reconstruction": 0.05875217309221625, + "proposal_success": 0.669308491051197, + "reocclusion": 0.26737124752253294, + "role_swap_consistency": 0.00044258072254403186, + "support_mode": 0.0058784369854644565, + "support_stability": 0.13682511821389198, + "total": 1.1672432621320088, + "uncertainty": 0.007140855586233859, + "visibility": 0.094703309237957, + "world_model": 2.072191367546717 + }, + "val": { + "action": 0.016218292733861342, + "arm_role": 0.00022501617463098632, + "belief": 0.10660513407654232, + "clearance": 0.07916852169566685, + "corridor": 0.23598399923907387, + "disturbance": 0.0013176489026389187, + "grasp_affordance": 0.009249631315469742, + "occluder_contact": 0.2084801279836231, + "persistence": 1.9978744321399264, + "phase": 0.46462951434983146, + "planner_ranking": 0.04140180618398719, + "planner_risk": 0.011076963868820004, + "planner_success": 0.5154120292928484, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1469912661446466, + "proposal_reconstruction": 0.05962582967347569, + "proposal_success": 0.6495795779758029, + "reocclusion": 0.2503652158710692, + "role_swap_consistency": 0.0, + "support_mode": 0.0004595977985041423, + "support_stability": 0.14600716531276703, + "total": 1.2128634585274591, + "uncertainty": 0.007759603775209851, + "visibility": 0.09225249456034766, + "world_model": 2.1404969029956393 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..0bcf8eccd89d9b325ac633686ea46db0f65b4fc9 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_dummy_seed13", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed13/checkpoint_best.pt", + "final_train_total": 1.1672432621320088, + "final_val_total": 1.2128634585274591, + "train_time_sec": 18.091050624847412, + "peak_gpu_memory_mb": 631.1953125, + "num_train_samples": 380, + "num_val_samples": 131, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..e1e68216fdc4019115c6b7f17c7c5392df457bc6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.625, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444445, + "visibility_integral": 32.801942747500206, + "corridor_availability": 0.8877548724412918, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.4711664057066363, + "disturbance_cost": 0.37882790300581193 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..9054a1385328752a45327544b4851a9bde36a967 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 32.802 +- corridor_availability: 0.888 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.471 +- disturbance_cost: 0.379 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.625 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..74ca2c61f4ec3939c696eed7007e9a865d6c211e --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.625 + }, + "mean_success": 0.5416666666666666, + "visibility_integral": 34.428366212381256, + "corridor_availability": 0.8909231291876899, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.4917179537341767, + "disturbance_cost": 0.39409097459995085 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..b51fcb7f86b07b0fd9c750161ce3da0abaedebcc --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_planner/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/checkpoint_best.pt +- mean_success: 0.542 +- visibility_integral: 34.428 +- corridor_availability: 0.891 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.492 +- disturbance_cost: 0.394 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.625 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..074c536d46ae8f64444e0ebd2a0258f013678675 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.625, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444445, + "visibility_integral": 33.27109728753567, + "corridor_availability": 0.8943836614489555, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.488106187582016, + "disturbance_cost": 0.3667886131960485 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..e053392771c0bba70b22dd3b446d5815443573a4 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/benchmark_no_role_symmetry/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 33.271 +- corridor_availability: 0.894 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.488 +- disturbance_cost: 0.367 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.625 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51fa0f37d7643addb961bc9cea308c2069430f10 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/config_resolved.yaml @@ -0,0 +1,147 @@ +experiment_name: proxy_interaction_r3d_stage1_dummy_seed14 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 14 +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage1_dummy_seed14.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage1_dummy_seed14.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 14 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..ebb4765c1498984c08d3c8a93c1d27d350126cf0 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.2846153846153846, + "planner_regret": 0.014314642176032066, + "planner_score_utility_spearman": 0.2153846174478531, + "risk_calibration_mse": 0.010775926522910595, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.02589959278702736, + "left_right_equivariance_error": 0.008901518605211201, + "belief_calibration_brier": 0.005614265333861113, + "reocclusion_calibration_brier": 0.28406235575675964, + "support_stability_mae": 0.025872904807329178, + "clearance_auc": 0.5220335124994485, + "memory_write_rate": 0.0, + "memory_saturation": 0.7309081554412842, + "num_samples": 130 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..d96faaa0d4b76c519009bf3a1267c5281727e646 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.033738364155093827, + "arm_role": 0.2658534389071671, + "belief": 0.1663714082290729, + "clearance": 0.1995344152674079, + "corridor": 0.2937144724031289, + "disturbance": 0.01641949706633265, + "grasp_affordance": 0.07253360034277041, + "occluder_contact": 0.262634892637531, + "persistence": 5.348720759153366, + "phase": 0.9128680676221848, + "planner_ranking": 0.7161665211121241, + "planner_risk": 0.03542382351588458, + "planner_success": 0.6313644871115685, + "proposal_diversity": 0.0, + "proposal_ranking": 1.328845535715421, + "proposal_reconstruction": 0.07508338304857413, + "proposal_success": 0.6797524491945902, + "reocclusion": 0.7106639867027601, + "role_swap_consistency": 0.0008167610091428893, + "support_mode": 0.7801499888300896, + "support_stability": 0.21256058973570666, + "total": 2.46435983479023, + "uncertainty": 0.17734388983808458, + "visibility": 0.16707653552293777, + "world_model": 4.078198651472728 + }, + "val": { + "action": 0.023770140690935984, + "arm_role": 0.0004891494075612476, + "belief": 0.11787863655222787, + "clearance": 0.08211326102415721, + "corridor": 0.2646504044532776, + "disturbance": 0.0077974022262626225, + "grasp_affordance": 0.010528300681875812, + "occluder_contact": 0.23685429162449306, + "persistence": 4.643319712744819, + "phase": 0.6877350012461344, + "planner_ranking": 0.5576971173286438, + "planner_risk": 0.012001174760775434, + "planner_success": 0.6474077436659071, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2473273674647014, + "proposal_reconstruction": 0.06659724977281359, + "proposal_success": 0.6868854032622443, + "reocclusion": 0.6894112494256761, + "role_swap_consistency": 0.0, + "support_mode": 0.7945182191001045, + "support_stability": 0.13977908922566307, + "total": 1.9791885084576077, + "uncertainty": 0.016744557561145887, + "visibility": 0.09745695524745518, + "world_model": 3.0115205181969538 + } + }, + { + "epoch": 1, + "train": { + "action": 0.02093995890269677, + "arm_role": 0.00021873527142209545, + "belief": 0.1156839697311322, + "clearance": 0.09139195084571838, + "corridor": 0.2529828678816557, + "disturbance": 0.003422619032789953, + "grasp_affordance": 0.017661277670413256, + "occluder_contact": 0.22792026090125242, + "persistence": 4.702208956082662, + "phase": 0.5312556164960066, + "planner_ranking": 0.20636002533137798, + "planner_risk": 0.015822513572250802, + "planner_success": 0.5910777151584625, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1696062982082367, + "proposal_reconstruction": 0.06334876082837582, + "proposal_success": 0.6708702544371287, + "reocclusion": 0.5039266211291155, + "role_swap_consistency": 0.0005020403975019386, + "support_mode": 0.3201311229883383, + "support_stability": 0.13968352818240723, + "total": 1.6841449290513992, + "uncertainty": 0.026018289965577424, + "visibility": 0.11011519034703572, + "world_model": 2.466151461005211 + }, + "val": { + "action": 0.020535202903880015, + "arm_role": 0.00012925987215971368, + "belief": 0.10588792545927896, + "clearance": 0.08000239895449744, + "corridor": 0.23227471278773415, + "disturbance": 0.0022439691221936503, + "grasp_affordance": 0.011653332453635003, + "occluder_contact": 0.21834516359700096, + "persistence": 4.46406364440918, + "phase": 0.4118766354189979, + "planner_ranking": 0.0892416491276688, + "planner_risk": 0.0152344209038549, + "planner_success": 0.6057713859611087, + "proposal_diversity": 0.0, + "proposal_ranking": 1.133669826719496, + "proposal_reconstruction": 0.06398758581942982, + "proposal_success": 0.6783458656734891, + "reocclusion": 0.2840655545393626, + "role_swap_consistency": 0.0, + "support_mode": 0.0015922162112676436, + "support_stability": 0.13890525698661804, + "total": 1.584020005332099, + "uncertainty": 0.014379701991048124, + "visibility": 0.09630187600851059, + "world_model": 2.5434003671010337 + } + }, + { + "epoch": 2, + "train": { + "action": 0.017165315182258684, + "arm_role": 0.00014243966719125942, + "belief": 0.1267746559654673, + "clearance": 0.09291451362272103, + "corridor": 0.2539026445398728, + "disturbance": 0.0040997157484525815, + "grasp_affordance": 0.016216314087311428, + "occluder_contact": 0.2287510900447766, + "persistence": 2.7297142073512077, + "phase": 0.4553527260820071, + "planner_ranking": 0.0675589762783299, + "planner_risk": 0.012244323831206808, + "planner_success": 0.5227356925606728, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1367994795242946, + "proposal_reconstruction": 0.06006583757698536, + "proposal_success": 0.6718559389313062, + "reocclusion": 0.28394716791808605, + "role_swap_consistency": 0.000532965175807476, + "support_mode": 0.0007756326898136953, + "support_stability": 0.14084124999741712, + "total": 1.2956190605958302, + "uncertainty": 0.011363255020114593, + "visibility": 0.11323032714426517, + "world_model": 2.120655362804731 + }, + "val": { + "action": 0.016470486712124612, + "arm_role": 0.00015339441274085807, + "belief": 0.15912896229161155, + "clearance": 0.07826702462302314, + "corridor": 0.21473425957891676, + "disturbance": 0.0018082650106710692, + "grasp_affordance": 0.008080463701238235, + "occluder_contact": 0.22728429403569964, + "persistence": 1.846471561325921, + "phase": 0.4164143088791106, + "planner_ranking": 0.05541756912134588, + "planner_risk": 0.011288604181673791, + "planner_success": 0.5237696303261651, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1311746835708618, + "proposal_reconstruction": 0.06064582823051347, + "proposal_success": 0.6669412983788384, + "reocclusion": 0.27248211950063705, + "role_swap_consistency": 0.0, + "support_mode": 0.00040661103816496, + "support_stability": 0.13817799753612942, + "total": 1.241025275654263, + "uncertainty": 0.003020187374204397, + "visibility": 0.11647009683979882, + "world_model": 2.323344442579481 + } + }, + { + "epoch": 3, + "train": { + "action": 0.015070427674800158, + "arm_role": 0.0002641689807205694, + "belief": 0.141230215318501, + "clearance": 0.07984113336230318, + "corridor": 0.225482989102602, + "disturbance": 0.0017908170169296984, + "grasp_affordance": 0.008550037746317685, + "occluder_contact": 0.21477928136785826, + "persistence": 1.6129546587665875, + "phase": 0.42590194568037987, + "planner_ranking": 0.04456973075866699, + "planner_risk": 0.010397601523436606, + "planner_success": 0.49412518242994946, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1504750202099483, + "proposal_reconstruction": 0.058567725432415806, + "proposal_success": 0.6462936575214068, + "reocclusion": 0.2506879176944494, + "role_swap_consistency": 0.000550856914439161, + "support_mode": 0.0003065853112881693, + "support_stability": 0.1366732595488429, + "total": 1.134415107468764, + "uncertainty": 0.0035936666245106608, + "visibility": 0.10351777387162049, + "world_model": 2.024999057253202 + }, + "val": { + "action": 0.016186242405739095, + "arm_role": 0.0002410423346898622, + "belief": 0.12203978498776753, + "clearance": 0.07702170064051946, + "corridor": 0.21113747523890602, + "disturbance": 0.0014993647216922706, + "grasp_affordance": 0.008119617278377214, + "occluder_contact": 0.21474246515168083, + "persistence": 1.9725701610247295, + "phase": 0.4842751953336928, + "planner_ranking": 0.04342265882425838, + "planner_risk": 0.01107009764139851, + "planner_success": 0.5070097777578566, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1282474862204657, + "proposal_reconstruction": 0.05997827731900745, + "proposal_success": 0.6469291316138374, + "reocclusion": 0.2716698878341251, + "role_swap_consistency": 0.0, + "support_mode": 0.00020467836778455725, + "support_stability": 0.13836157073577246, + "total": 1.2091523673799303, + "uncertainty": 0.0025335378272251952, + "visibility": 0.09879730641841888, + "world_model": 2.1507359743118286 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..a85631cd9ae07a289d7e835a0f3a9e72081231f9 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_dummy_seed14", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed14/checkpoint_best.pt", + "final_train_total": 1.134415107468764, + "final_val_total": 1.2091523673799303, + "train_time_sec": 23.220722675323486, + "peak_gpu_memory_mb": 626.4716796875, + "num_train_samples": 381, + "num_val_samples": 130, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..69244ac387c2a97bbcc7fdcc5b8d12e93d00c799 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4583333333333333, + "bag_proxy": 0.625, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5972222222222222, + "visibility_integral": 29.697570121950573, + "corridor_availability": 0.8675610861844487, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.20430763148842, + "disturbance_cost": 0.36563710583787823 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..72c996af666245b0dccd85479c7d965290d910fc --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/checkpoint_best.pt +- mean_success: 0.597 +- visibility_integral: 29.698 +- corridor_availability: 0.868 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.204 +- disturbance_cost: 0.366 +- foliage_proxy_success: 0.458 +- bag_proxy_success: 0.625 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..69244ac387c2a97bbcc7fdcc5b8d12e93d00c799 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4583333333333333, + "bag_proxy": 0.625, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5972222222222222, + "visibility_integral": 29.697570121950573, + "corridor_availability": 0.8675610861844487, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.20430763148842, + "disturbance_cost": 0.36563710583787823 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..72c996af666245b0dccd85479c7d965290d910fc --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_planner/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/checkpoint_best.pt +- mean_success: 0.597 +- visibility_integral: 29.698 +- corridor_availability: 0.868 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.204 +- disturbance_cost: 0.366 +- foliage_proxy_success: 0.458 +- bag_proxy_success: 0.625 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..9e13b53c7b2c4dbbb2ad8a48a58cc8a3f0100ff8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.625, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.6111111111111112, + "visibility_integral": 28.954636810554398, + "corridor_availability": 0.8660841253068712, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.10539705814184, + "disturbance_cost": 0.35598844579524463 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..3654a1eaa96e4118f0eb8ae904a2a3349f87ad7c --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/benchmark_no_role_symmetry/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/checkpoint_best.pt +- mean_success: 0.611 +- visibility_integral: 28.955 +- corridor_availability: 0.866 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.105 +- disturbance_cost: 0.356 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.625 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf93049674d59f6ad7937203233ea51c2cdbbaed --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/config_resolved.yaml @@ -0,0 +1,147 @@ +experiment_name: proxy_interaction_r3d_stage1_dummy_seed15 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 15 +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage1_dummy_seed15.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage1_dummy_seed15.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 15 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..3f3e0d068805287846e0dd0e829c2e79fce92c83 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.3053435114503817, + "planner_regret": 0.013406210578978062, + "planner_score_utility_spearman": 0.2839694619178772, + "risk_calibration_mse": 0.010891024023294449, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.02313310280442238, + "left_right_equivariance_error": 0.006598936667775407, + "belief_calibration_brier": 0.00368268764577806, + "reocclusion_calibration_brier": 0.2288682460784912, + "support_stability_mae": 0.025202222168445587, + "clearance_auc": 0.9189163634555108, + "memory_write_rate": 0.0, + "memory_saturation": 0.8174758553504944, + "num_samples": 131 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..f4ed296c1271db4ab0afdb9b80f79aa9904ffae6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.028008008919035394, + "arm_role": 0.2316993211661611, + "belief": 0.21131388066957393, + "clearance": 0.19917472638189793, + "corridor": 0.3046618662774563, + "disturbance": 0.020259966540227953, + "grasp_affordance": 0.15939014249791703, + "occluder_contact": 0.3023037730405728, + "persistence": 5.1030773023764295, + "phase": 0.7391876379648844, + "planner_ranking": 0.6672491803765297, + "planner_risk": 0.035407664448333286, + "planner_success": 0.6247484882672628, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2685468345880508, + "proposal_reconstruction": 0.07012522220611572, + "proposal_success": 0.6749546950062116, + "reocclusion": 0.6581779879828294, + "role_swap_consistency": 0.0007787000698347887, + "support_mode": 0.6318444466839234, + "support_stability": 0.21354713415106139, + "total": 2.377249076962471, + "uncertainty": 0.2297215286331872, + "visibility": 0.20075704219440618, + "world_model": 4.083281387885411 + }, + "val": { + "action": 0.023762268117732473, + "arm_role": 0.00020197388787184737, + "belief": 0.1366901993751526, + "clearance": 0.10309203879700767, + "corridor": 0.26862603922684986, + "disturbance": 0.0037259276594138807, + "grasp_affordance": 0.044725324544641704, + "occluder_contact": 0.2536553243796031, + "persistence": 4.777863184611003, + "phase": 0.5066013468636407, + "planner_ranking": 0.44456031918525696, + "planner_risk": 0.01433694911085897, + "planner_success": 0.6283807026015388, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1667029857635498, + "proposal_reconstruction": 0.0664608735177252, + "proposal_success": 0.6838224861356947, + "reocclusion": 0.3364369339413113, + "role_swap_consistency": 0.0, + "support_mode": 0.06715444227059682, + "support_stability": 0.14777708219157326, + "total": 1.8394301467471652, + "uncertainty": 0.07208604945076837, + "visibility": 0.12188677820894453, + "world_model": 3.079341014226278 + } + }, + { + "epoch": 1, + "train": { + "action": 0.018888041842728853, + "arm_role": 0.00043030476990679745, + "belief": 0.11719414374480645, + "clearance": 0.08535642797748248, + "corridor": 0.24796467771132788, + "disturbance": 0.0024048478032151857, + "grasp_affordance": 0.022171703943361838, + "occluder_contact": 0.22088239962855974, + "persistence": 4.555501798788707, + "phase": 0.43327916599810123, + "planner_ranking": 0.15463371171305576, + "planner_risk": 0.01981719226265947, + "planner_success": 0.5631782834728559, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1632012923558552, + "proposal_reconstruction": 0.0615519261918962, + "proposal_success": 0.6722564473748207, + "reocclusion": 0.287830734004577, + "role_swap_consistency": 0.00048373279059887864, + "support_mode": 0.008119381836574272, + "support_stability": 0.13662359025329351, + "total": 1.567106415828069, + "uncertainty": 0.03243653344300886, + "visibility": 0.11203592922538519, + "world_model": 2.404594744245211 + }, + "val": { + "action": 0.019907095055613253, + "arm_role": 0.00038116834993060265, + "belief": 0.1014507081773546, + "clearance": 0.07728531956672668, + "corridor": 0.22947043677171072, + "disturbance": 0.0014698771928023133, + "grasp_affordance": 0.02056772096289529, + "occluder_contact": 0.20453951425022548, + "persistence": 3.6124378045399985, + "phase": 0.47070127063327366, + "planner_ranking": 0.08099263947870997, + "planner_risk": 0.017360565563042957, + "planner_success": 0.5593770245711008, + "proposal_diversity": 0.0, + "proposal_ranking": 1.11685311794281, + "proposal_reconstruction": 0.0633203275501728, + "proposal_success": 0.683642049630483, + "reocclusion": 0.42518342865837944, + "role_swap_consistency": 0.0, + "support_mode": 8.963614042537908e-05, + "support_stability": 0.1495772964424557, + "total": 1.5412384668986003, + "uncertainty": 0.024036270876725514, + "visibility": 0.10443270951509476, + "world_model": 2.6981404887305365 + } + }, + { + "epoch": 2, + "train": { + "action": 0.01506453799083829, + "arm_role": 0.0002299571582019174, + "belief": 0.10169448765615623, + "clearance": 0.08062320730338494, + "corridor": 0.23694788571447134, + "disturbance": 0.002010827219540564, + "grasp_affordance": 0.012944541425288966, + "occluder_contact": 0.20663638102511564, + "persistence": 2.024513818323612, + "phase": 0.4406547602266073, + "planner_ranking": 0.052334820929293834, + "planner_risk": 0.012688904457415143, + "planner_success": 0.4998842130104701, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1411344707012177, + "proposal_reconstruction": 0.058503514621406794, + "proposal_success": 0.663138655324777, + "reocclusion": 0.28770653810352087, + "role_swap_consistency": 0.0005917157322983257, + "support_mode": 0.00027886544603461516, + "support_stability": 0.14369840795795122, + "total": 1.2098931844035785, + "uncertainty": 0.009047253523021936, + "visibility": 0.09652530650297801, + "world_model": 2.1335272987683616 + }, + "val": { + "action": 0.0173407852028807, + "arm_role": 0.00028451886545452807, + "belief": 0.09623022625843684, + "clearance": 0.07612819969654083, + "corridor": 0.22281885809368557, + "disturbance": 0.001401680282368842, + "grasp_affordance": 0.00781761777276794, + "occluder_contact": 0.20622349116537306, + "persistence": 2.1598196625709534, + "phase": 0.47410638795958626, + "planner_ranking": 0.0378283916765617, + "planner_risk": 0.013348096515983343, + "planner_success": 0.4943488637606303, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1125682062572904, + "proposal_reconstruction": 0.06057575262255139, + "proposal_success": 0.6509590811199613, + "reocclusion": 0.2778696550263299, + "role_swap_consistency": 0.0, + "support_mode": 7.348006571798275e-05, + "support_stability": 0.14099042697085273, + "total": 1.2928278247515361, + "uncertainty": 0.0023198039270937443, + "visibility": 0.08993011878596412, + "world_model": 2.425517029232449 + } + }, + { + "epoch": 3, + "train": { + "action": 0.015032132350218793, + "arm_role": 0.00015960596041016592, + "belief": 0.10330141056329012, + "clearance": 0.0756644958940645, + "corridor": 0.22099452962478003, + "disturbance": 0.0017974149668589234, + "grasp_affordance": 0.008848114540645232, + "occluder_contact": 0.20204609570403895, + "persistence": 1.6058371538917224, + "phase": 0.42861080542206764, + "planner_ranking": 0.040083787171170115, + "planner_risk": 0.010861996522483727, + "planner_success": 0.48133989547689754, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1467161824305852, + "proposal_reconstruction": 0.058588774874806404, + "proposal_success": 0.6429290076096853, + "reocclusion": 0.24268781704207262, + "role_swap_consistency": 0.00047596763154918637, + "support_mode": 2.783346417345456e-05, + "support_stability": 0.1325785775358478, + "total": 1.1217727214097977, + "uncertainty": 0.003058687725570053, + "visibility": 0.09524129331111908, + "world_model": 2.0093316386143365 + }, + "val": { + "action": 0.016727436126934156, + "arm_role": 0.0002483524456490866, + "belief": 0.09281252986854976, + "clearance": 0.0730266264743275, + "corridor": 0.22520612014664543, + "disturbance": 0.0031746443160550874, + "grasp_affordance": 0.00780139294349485, + "occluder_contact": 0.20420674648549822, + "persistence": 1.9897065493795607, + "phase": 0.42935120397143894, + "planner_ranking": 0.03520135974718465, + "planner_risk": 0.012488630910714468, + "planner_success": 0.5116605394416385, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1216257943047419, + "proposal_reconstruction": 0.05996803608205584, + "proposal_success": 0.6389667987823486, + "reocclusion": 0.26481906490193474, + "role_swap_consistency": 0.0, + "support_mode": 4.154515813247094e-05, + "support_stability": 0.13968953986962637, + "total": 1.1943119830555387, + "uncertainty": 0.0017189466937755544, + "visibility": 0.09683923174937566, + "world_model": 2.1186628209220038 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..8562671a18b0e5c660f58f3a74d5286b1226c769 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage1_dummy_seed15", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage1_dummy_seed15/checkpoint_best.pt", + "final_train_total": 1.1217727214097977, + "final_val_total": 1.1943119830555387, + "train_time_sec": 20.030457735061646, + "peak_gpu_memory_mb": 631.1953125, + "num_train_samples": 380, + "num_val_samples": 131, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..53a27f4620326c925d07671745709e4e89c0a46f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.625 + }, + "mean_success": 0.5416666666666666, + "visibility_integral": 34.34427807728449, + "corridor_availability": 0.893132723040051, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.3119179729333856, + "disturbance_cost": 0.39262517919350004 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..673ea758ca9473f6de04fb0a1244b42348b11b40 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/checkpoint_best.pt +- mean_success: 0.542 +- visibility_integral: 34.344 +- corridor_availability: 0.893 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.312 +- disturbance_cost: 0.393 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.625 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8259a4b910eccd10954ce134823179a566fdb9f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage2_clip_seed11 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 11 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage2_seed11.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage2_seed11.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 11 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..536a3a6ecf37e15b70651b86137c6fc96616f8b6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.27906976744186046, + "planner_regret": 0.014687228947877884, + "planner_score_utility_spearman": 0.210852712392807, + "risk_calibration_mse": 0.00986906886100769, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.01944497972726822, + "left_right_equivariance_error": 0.0002826815475462795, + "belief_calibration_brier": 0.003809324698522687, + "reocclusion_calibration_brier": 0.28801918029785156, + "support_stability_mae": 0.026344481855630875, + "clearance_auc": 0.9058322298594268, + "memory_write_rate": 0.0, + "memory_saturation": 0.5182730555534363, + "num_samples": 129 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..058ccc17a66721e9e9f7b9b357c61a5ef53f9916 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.026644286036452386, + "arm_role": 0.024380755674152474, + "belief": 0.1216605089955929, + "clearance": 0.09435067850491763, + "corridor": 0.29937174982581466, + "disturbance": 0.00554025236528562, + "grasp_affordance": 0.02358881158130097, + "occluder_contact": 0.2092289766247984, + "persistence": 6.1897567423750885, + "phase": 0.7662794502617801, + "planner_ranking": 0.16281673026756807, + "planner_risk": 0.014868872865537152, + "planner_success": 0.6131215223467162, + "proposal_diversity": 0.0, + "proposal_ranking": 1.24531464707789, + "proposal_reconstruction": 0.06868474318094903, + "proposal_success": 0.6827385360033724, + "reocclusion": 0.7132243294054301, + "role_swap_consistency": 0.0003943645942521023, + "support_mode": 0.7494733720549738, + "support_stability": 0.1631323242406434, + "total": 1.8074374061604446, + "uncertainty": 0.02646746405007053, + "visibility": 0.11232841992019359, + "world_model": 2.612228818900922 + }, + "val": { + "action": 0.021447077637108472, + "arm_role": 2.0711024318678448e-05, + "belief": 0.09341082458312695, + "clearance": 0.07425147117330477, + "corridor": 0.23059940796632034, + "disturbance": 0.002393470596031805, + "grasp_affordance": 0.011041764040979056, + "occluder_contact": 0.18809776099828573, + "persistence": 4.780190816292396, + "phase": 0.6694326795064486, + "planner_ranking": 0.045178403781476216, + "planner_risk": 0.010466235164158906, + "planner_success": 0.5507269249512599, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1411677369704614, + "proposal_reconstruction": 0.06366521647343269, + "proposal_success": 0.6671431862390959, + "reocclusion": 0.6940084649966314, + "role_swap_consistency": 0.0, + "support_mode": 0.6672373189375951, + "support_stability": 0.1576275432912203, + "total": 1.518847630574153, + "uncertainty": 0.007903887732670858, + "visibility": 0.08643374764002286, + "world_model": 2.1690124484208915 + } + }, + { + "epoch": 1, + "train": { + "action": 0.022044192911832745, + "arm_role": 6.96019976550996e-05, + "belief": 0.1076193918810465, + "clearance": 0.08718149573664079, + "corridor": 0.25136479897746394, + "disturbance": 0.0036231905150284236, + "grasp_affordance": 0.013151204869326652, + "occluder_contact": 0.20517516619872048, + "persistence": 3.9877223619186752, + "phase": 0.6843610034563155, + "planner_ranking": 0.052667735511481836, + "planner_risk": 0.011696826657908116, + "planner_success": 0.5557226944344206, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1597779933070638, + "proposal_reconstruction": 0.06451685880725297, + "proposal_success": 0.6708246469497681, + "reocclusion": 0.6674874808775817, + "role_swap_consistency": 0.00026557610019144277, + "support_mode": 0.6283252975703534, + "support_stability": 0.14863310884976885, + "total": 1.4848913787547207, + "uncertainty": 0.006362306834499096, + "visibility": 0.1025782693665065, + "world_model": 2.1518520416389584 + }, + "val": { + "action": 0.01952767800539732, + "arm_role": 0.00011286609648882753, + "belief": 0.11118833353886237, + "clearance": 0.08315109071823266, + "corridor": 0.44410995245069407, + "disturbance": 0.004647846037127797, + "grasp_affordance": 0.011384243833330962, + "occluder_contact": 0.19399810410462892, + "persistence": 7.307789671675374, + "phase": 0.5388953167658586, + "planner_ranking": 0.03512171468243581, + "planner_risk": 0.010607366236105848, + "planner_success": 0.5378215003472108, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1689154845017653, + "proposal_reconstruction": 0.06193383215711667, + "proposal_success": 0.6688321847182054, + "reocclusion": 0.6636646819802431, + "role_swap_consistency": 0.0, + "support_mode": 1.039346590408912, + "support_stability": 0.15784503956540272, + "total": 1.6776308609889103, + "uncertainty": 1.152622356900023e-05, + "visibility": 0.11231438769743993, + "world_model": 2.0740180052243744 + } + }, + { + "epoch": 2, + "train": { + "action": 0.018700552844632592, + "arm_role": 0.000972873253347986, + "belief": 0.12275376962741633, + "clearance": 0.08487847380593654, + "corridor": 0.24357045909599523, + "disturbance": 0.00331472009285923, + "grasp_affordance": 0.01026101550818738, + "occluder_contact": 0.2153189008304586, + "persistence": 2.4059808037708565, + "phase": 0.5033158507022558, + "planner_ranking": 0.03657240937522146, + "planner_risk": 0.010864751256517188, + "planner_success": 0.5124278418836793, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1595254449944221, + "proposal_reconstruction": 0.06179310919726706, + "proposal_success": 0.6566679927066983, + "reocclusion": 0.38847498592423246, + "role_swap_consistency": 0.0008652656654451233, + "support_mode": 0.35570250506176376, + "support_stability": 0.14306114877315715, + "total": 1.2949361607666414, + "uncertainty": 0.0021972638202774016, + "visibility": 0.11033565828001311, + "world_model": 2.0251020854680326 + }, + "val": { + "action": 0.02487506473150391, + "arm_role": 2.681288724095164e-06, + "belief": 0.1081794464817414, + "clearance": 0.07636868116947321, + "corridor": 0.20140686992269297, + "disturbance": 0.001994377507043492, + "grasp_affordance": 0.008677966799587012, + "occluder_contact": 0.21161039288227373, + "persistence": 1.3219125701257817, + "phase": 0.37710464734297533, + "planner_ranking": 0.03206984894719566, + "planner_risk": 0.011674165392581088, + "planner_success": 0.46599124119831964, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1611316717587985, + "proposal_reconstruction": 0.06948714015575555, + "proposal_success": 0.6533986550111037, + "reocclusion": 0.31326579468754623, + "role_swap_consistency": 0.0, + "support_mode": 0.17989734836066, + "support_stability": 0.13907468725855535, + "total": 1.1672267354451693, + "uncertainty": 0.0006145757817158972, + "visibility": 0.09655883323687774, + "world_model": 1.9251508355140685 + } + }, + { + "epoch": 3, + "train": { + "action": 0.01569180575160417, + "arm_role": 0.027261029675368864, + "belief": 0.1110531479042238, + "clearance": 0.07751915099694155, + "corridor": 0.20897386773052554, + "disturbance": 0.0019330896785874818, + "grasp_affordance": 0.008865814846184553, + "occluder_contact": 0.2038286757406764, + "persistence": 1.1690228903273747, + "phase": 0.3392091920862647, + "planner_ranking": 0.03543819029409765, + "planner_risk": 0.010516670346321021, + "planner_success": 0.48809501739384614, + "proposal_diversity": 0.0, + "proposal_ranking": 1.147950800613583, + "proposal_reconstruction": 0.05924179073403643, + "proposal_success": 0.6485388935860539, + "reocclusion": 0.25005930125791365, + "role_swap_consistency": 0.001582450235418851, + "support_mode": 0.1954826559695898, + "support_stability": 0.13409689854811, + "total": 1.056747386592845, + "uncertainty": 0.0003292631887740548, + "visibility": 0.10012162002827485, + "world_model": 1.5451418317425314 + }, + "val": { + "action": 0.012563670304818796, + "arm_role": 9.379507576667834e-05, + "belief": 0.09966908166041741, + "clearance": 0.07572867818749868, + "corridor": 0.19870975395807852, + "disturbance": 0.0011806640476536884, + "grasp_affordance": 0.00889887514595802, + "occluder_contact": 0.20167340773802536, + "persistence": 0.8940682159306911, + "phase": 0.2346100378781557, + "planner_ranking": 0.04236364569671353, + "planner_risk": 0.009735576174884604, + "planner_success": 0.4702391225558061, + "proposal_diversity": 0.0, + "proposal_ranking": 1.122354874244103, + "proposal_reconstruction": 0.056834035424085765, + "proposal_success": 0.6365870714187623, + "reocclusion": 0.21859066887543752, + "role_swap_consistency": 0.0, + "support_mode": 0.10797913265056334, + "support_stability": 0.1366611572698905, + "total": 0.9305079854451693, + "uncertainty": 0.0003750753218460327, + "visibility": 0.09423433232765932, + "world_model": 1.2587093903468205 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..12df1ebd0478acadd8ee7d73209f26b8fecc1fc7 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_clip_seed11", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed11/checkpoint_best.pt", + "final_train_total": 1.056747386592845, + "final_val_total": 0.9305079854451693, + "train_time_sec": 131.29005098342896, + "peak_gpu_memory_mb": 1894.7548828125, + "num_train_samples": 382, + "num_val_samples": 129, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..1adcab281dc7f4bdb68aa7ad2d6156a5d63ba9ea --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.25, + "bag_proxy": 0.20833333333333334, + "cloth_proxy": 0.5833333333333334 + }, + "mean_success": 0.34722222222222227, + "visibility_integral": 19.064177172051537, + "corridor_availability": 0.5252470484831266, + "reocclusion_rate": 0.034895833333333334, + "persistence_horizon_mae": 2.8043047013660196, + "disturbance_cost": 0.100128799987336 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..05966447af0dccbac2ba89e43d47a3c0157d24fa --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/checkpoint_best.pt +- mean_success: 0.347 +- visibility_integral: 19.064 +- corridor_availability: 0.525 +- reocclusion_rate: 0.035 +- persistence_horizon_mae: 2.804 +- disturbance_cost: 0.100 +- foliage_proxy_success: 0.250 +- bag_proxy_success: 0.208 +- cloth_proxy_success: 0.583 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51410160afba1bc080f013abc06d9c7e4edfc9f1 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage2_clip_seed12 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 12 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage2_seed12.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage2_seed12.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 12 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..629544cc9b35c7162fa2ca945991b0bcf53d4d9e --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.2692307692307692, + "planner_regret": 0.015571335330605507, + "planner_score_utility_spearman": 0.2846153974533081, + "risk_calibration_mse": 0.010228095576167107, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.025593629106879234, + "left_right_equivariance_error": 0.0001871140535292921, + "belief_calibration_brier": 0.006486459169536829, + "reocclusion_calibration_brier": 0.24318400025367737, + "support_stability_mae": 0.0361579954624176, + "clearance_auc": 0.6852405197686325, + "memory_write_rate": 0.13076923787593842, + "memory_saturation": 0.5033961534500122, + "num_samples": 130 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a32c66fa16a18d74af8a92e02e794db3f131f8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.0265116647753806, + "arm_role": 0.026648694620082517, + "belief": 0.12265744023220078, + "clearance": 0.09419052814820986, + "corridor": 0.29069200661325956, + "disturbance": 0.0063096336832193685, + "grasp_affordance": 0.02155034775060665, + "occluder_contact": 0.21986118271088725, + "persistence": 7.344096696604024, + "phase": 0.7511836346531413, + "planner_ranking": 0.1567143966832472, + "planner_risk": 0.01613354602277325, + "planner_success": 0.6087345007358421, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2436198916734826, + "proposal_reconstruction": 0.06823750427334097, + "proposal_success": 0.680850050212201, + "reocclusion": 0.7201950554760339, + "role_swap_consistency": 0.0004218729012962723, + "support_mode": 0.7505828697643979, + "support_stability": 0.15377593591823902, + "total": 1.8732728003207302, + "uncertainty": 0.028985263621364478, + "visibility": 0.11896002095641266, + "world_model": 2.6631099815768096 + }, + "val": { + "action": 0.021836393154584445, + "arm_role": 8.102541575611283e-06, + "belief": 0.09969789322752219, + "clearance": 0.08271008575191864, + "corridor": 0.24081495908590464, + "disturbance": 0.0023920218258773763, + "grasp_affordance": 0.01155611932134399, + "occluder_contact": 0.20507212510475745, + "persistence": 4.512984638947707, + "phase": 0.6603462411807134, + "planner_ranking": 0.04704892609734088, + "planner_risk": 0.010522140137170656, + "planner_success": 0.52820757489938, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1522641053566567, + "proposal_reconstruction": 0.06444322845110527, + "proposal_success": 0.6650473337907058, + "reocclusion": 0.6900336522322434, + "role_swap_consistency": 0.0, + "support_mode": 0.6587190958169791, + "support_stability": 0.1457874579498401, + "total": 1.5405189422460703, + "uncertainty": 0.0072207282010752424, + "visibility": 0.09836944525058453, + "world_model": 2.3159281272154586 + } + }, + { + "epoch": 1, + "train": { + "action": 0.02143435196278139, + "arm_role": 2.7934918228868416e-05, + "belief": 0.10347749129015738, + "clearance": 0.08618570594068285, + "corridor": 0.24549250739641215, + "disturbance": 0.0026278662473882427, + "grasp_affordance": 0.010813114165050508, + "occluder_contact": 0.21092523938698293, + "persistence": 4.392642458071883, + "phase": 0.6784450670811518, + "planner_ranking": 0.04968888171078444, + "planner_risk": 0.011175002839967257, + "planner_success": 0.5800106824614615, + "proposal_diversity": 0.0, + "proposal_ranking": 1.150295885757626, + "proposal_reconstruction": 0.06383146423631937, + "proposal_success": 0.6773600406671694, + "reocclusion": 0.7005154393730363, + "role_swap_consistency": 0.00020737489747128533, + "support_mode": 0.695506789921466, + "support_stability": 0.1444786221413088, + "total": 1.5313815312235768, + "uncertainty": 0.004678997660972451, + "visibility": 0.10154765454262339, + "world_model": 2.215607233384517 + }, + "val": { + "action": 0.021598762689301602, + "arm_role": 1.559978859754315e-05, + "belief": 0.10402895246560757, + "clearance": 0.08615114350731556, + "corridor": 0.24378756766135876, + "disturbance": 0.0017933934510857888, + "grasp_affordance": 0.00965615829023031, + "occluder_contact": 0.22014242937931647, + "persistence": 3.8692049705065212, + "phase": 0.6658917142794682, + "planner_ranking": 0.034095349999608095, + "planner_risk": 0.010596161578835634, + "planner_success": 0.5355585918976711, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1134835408284114, + "proposal_reconstruction": 0.06421315005192389, + "proposal_success": 0.6758711869900044, + "reocclusion": 0.6889989018440247, + "role_swap_consistency": 0.0, + "support_mode": 0.660473997776325, + "support_stability": 0.14310103247945125, + "total": 1.4629831167367788, + "uncertainty": 0.0009627942819721424, + "visibility": 0.10057846101430747, + "world_model": 2.1548714188429026 + } + }, + { + "epoch": 2, + "train": { + "action": 0.021090031074876874, + "arm_role": 0.00018929935874739242, + "belief": 0.10672742253198673, + "clearance": 0.08872412362608922, + "corridor": 0.2733879856331303, + "disturbance": 0.0030289446660285483, + "grasp_affordance": 0.01181759231562936, + "occluder_contact": 0.21809014919852712, + "persistence": 3.656308846635968, + "phase": 0.6599919983229712, + "planner_ranking": 0.041442906926855566, + "planner_risk": 0.010613277656603994, + "planner_success": 0.521210808092387, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1467299227315093, + "proposal_reconstruction": 0.06341781315067052, + "proposal_success": 0.6737371065229646, + "reocclusion": 0.6779327008639174, + "role_swap_consistency": 0.00022924937225094415, + "support_mode": 0.6753841819563461, + "support_stability": 0.14452538651009506, + "total": 1.4377010383531061, + "uncertainty": 0.0023537015170198385, + "visibility": 0.10841247750475456, + "world_model": 2.0592898433121087 + }, + "val": { + "action": 0.021482723922683643, + "arm_role": 1.9337384835055744e-05, + "belief": 0.1271346492262987, + "clearance": 0.083377072845514, + "corridor": 0.2741409402913772, + "disturbance": 0.002117429635165116, + "grasp_affordance": 0.011649288172618701, + "occluder_contact": 0.21174047933175014, + "persistence": 3.8564615689791166, + "phase": 0.6485314034498655, + "planner_ranking": 0.03306306126446893, + "planner_risk": 0.01047296588210604, + "planner_success": 0.5441290960862086, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1324434812252338, + "proposal_reconstruction": 0.06412716972140166, + "proposal_success": 0.6779822074449979, + "reocclusion": 0.6870533975271078, + "role_swap_consistency": 0.0, + "support_mode": 0.6577057308875598, + "support_stability": 0.14096688464857066, + "total": 1.4323132129815908, + "uncertainty": 0.0027863349163313755, + "visibility": 0.10942233365315657, + "world_model": 1.9970711038662836 + } + }, + { + "epoch": 3, + "train": { + "action": 0.020458291170153162, + "arm_role": 0.00011509968972330942, + "belief": 0.11413928630386347, + "clearance": 0.09032999263852054, + "corridor": 0.2853015211679917, + "disturbance": 0.0033650345857184284, + "grasp_affordance": 0.011570076631255331, + "occluder_contact": 0.22306315101566115, + "persistence": 2.4543060132619727, + "phase": 0.5411919998248834, + "planner_ranking": 0.03125114804425049, + "planner_risk": 0.010697233745266307, + "planner_success": 0.5028430128674857, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1347670686182552, + "proposal_reconstruction": 0.06315274327915377, + "proposal_success": 0.677255996234754, + "reocclusion": 0.43758569880393555, + "role_swap_consistency": 0.0010055845596642276, + "support_mode": 0.44074948295872873, + "support_stability": 0.13863542222988387, + "total": 1.3077349625332817, + "uncertainty": 0.0008059210962010913, + "visibility": 0.12058865647587477, + "world_model": 1.9945788954565038 + }, + "val": { + "action": 0.01820472377137496, + "arm_role": 2.3236593000101145e-06, + "belief": 0.11443154307512136, + "clearance": 0.08722961630958777, + "corridor": 0.24973363708346508, + "disturbance": 0.0022052301745973707, + "grasp_affordance": 0.00954639521212532, + "occluder_contact": 0.22095146018725176, + "persistence": 1.743605894180767, + "phase": 0.3354514233964997, + "planner_ranking": 0.03021946732674573, + "planner_risk": 0.0104008026027049, + "planner_success": 0.5254414299359689, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1170437849484958, + "proposal_reconstruction": 0.06222414580675272, + "proposal_success": 0.6824415674576393, + "reocclusion": 0.3508238720635955, + "role_swap_consistency": 0.0, + "support_mode": 0.27097287286932653, + "support_stability": 0.1482541099190712, + "total": 1.2100626652057354, + "uncertainty": 5.787161892910192e-05, + "visibility": 0.11422394622976963, + "world_model": 1.9349967433856083 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..805e5df9f7a215d903f3f49df365decfd4ae4614 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_clip_seed12", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed12/checkpoint_best.pt", + "final_train_total": 1.3077349625332817, + "final_val_total": 1.2100626652057354, + "train_time_sec": 146.35694217681885, + "peak_gpu_memory_mb": 1917.4189453125, + "num_train_samples": 381, + "num_val_samples": 130, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..5628bc52a79f9b40c1c662dfe5cba2adb1453f63 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5972222222222223, + "visibility_integral": 31.123170379135345, + "corridor_availability": 0.8694257512688637, + "reocclusion_rate": 0.00034722222222222224, + "persistence_horizon_mae": 1.8432530318753104, + "disturbance_cost": 0.32384756999090314 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..b8724500f3e85627353cca30db27bb5e451a1c61 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/checkpoint_best.pt +- mean_success: 0.597 +- visibility_integral: 31.123 +- corridor_availability: 0.869 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.843 +- disturbance_cost: 0.324 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cfd523ac65ba14b15a6abef147c17da31bc6968c --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage2_clip_seed13 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 13 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage2_seed13.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage2_seed13.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 13 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..860433c8133514d5f5b604be4e8cd55f385902ab --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.2595419847328244, + "planner_regret": 0.015185066498816013, + "planner_score_utility_spearman": 0.2809160351753235, + "risk_calibration_mse": 0.010697935707867146, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.019719451665878296, + "left_right_equivariance_error": 8.677602234070726e-05, + "belief_calibration_brier": 0.003582377452403307, + "reocclusion_calibration_brier": 0.2486726939678192, + "support_stability_mae": 0.027683958411216736, + "clearance_auc": 0.8539042374111527, + "memory_write_rate": 0.49614080786705017, + "memory_saturation": 0.3391597867012024, + "num_samples": 131 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..675aff85763dd14e1f032de475fbe32a73b5e212 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.025199293658921592, + "arm_role": 0.031675430975462264, + "belief": 0.12093473198382478, + "clearance": 0.09368413742631673, + "corridor": 0.29680905555816073, + "disturbance": 0.007625889547575513, + "grasp_affordance": 0.023363290535972307, + "occluder_contact": 0.21423418019947252, + "persistence": 8.489773372286244, + "phase": 0.7337813527960526, + "planner_ranking": 0.23520062585716675, + "planner_risk": 0.015000962853235635, + "planner_success": 0.6204052362002824, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2412571900769285, + "proposal_reconstruction": 0.0672246428108529, + "proposal_success": 0.6834057503624966, + "reocclusion": 0.7059739547340493, + "role_swap_consistency": 0.00044641466650462364, + "support_mode": 0.737896728515625, + "support_stability": 0.16598236134863997, + "total": 1.936751513732107, + "uncertainty": 0.02631602293505227, + "visibility": 0.12221070531951754, + "world_model": 2.6260432685676376 + }, + "val": { + "action": 0.02296490968684807, + "arm_role": 1.1920925544472993e-06, + "belief": 0.10552826659245924, + "clearance": 0.07981697961010716, + "corridor": 0.24074691330844705, + "disturbance": 0.0019879042129173977, + "grasp_affordance": 0.012804760837532354, + "occluder_contact": 0.20304674835819186, + "persistence": 4.831832351106586, + "phase": 0.662635090676221, + "planner_ranking": 0.04777729516111625, + "planner_risk": 0.011265802354142634, + "planner_success": 0.5608469446500143, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1612638379588271, + "proposal_reconstruction": 0.06499927355484529, + "proposal_success": 0.6768998079227678, + "reocclusion": 0.692740258845416, + "role_swap_consistency": 0.0, + "support_mode": 0.6566608846187592, + "support_stability": 0.15997966932076396, + "total": 1.5804176764054731, + "uncertainty": 0.012467421647725683, + "visibility": 0.09922279044985771, + "world_model": 2.3550273776054382 + } + }, + { + "epoch": 1, + "train": { + "action": 0.021963300938276867, + "arm_role": 6.080301184403269e-06, + "belief": 0.10263110273762753, + "clearance": 0.0788226080960349, + "corridor": 0.2412219915735094, + "disturbance": 0.002794332566535511, + "grasp_affordance": 0.009757642472456944, + "occluder_contact": 0.195604843920783, + "persistence": 4.26262659869696, + "phase": 0.6962530838815789, + "planner_ranking": 0.051491495151506236, + "planner_risk": 0.011504811691855521, + "planner_success": 0.5311845611584814, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1484619347672713, + "proposal_reconstruction": 0.06422912276497013, + "proposal_success": 0.6744128418596167, + "reocclusion": 0.7004858849864257, + "role_swap_consistency": 0.00023457101549291494, + "support_mode": 0.6761667351973685, + "support_stability": 0.1433959776927766, + "total": 1.493165501795317, + "uncertainty": 0.005074691738149053, + "visibility": 0.095918686060529, + "world_model": 2.146357953234723 + }, + "val": { + "action": 0.023318854075941173, + "arm_role": 1.2460903566203672e-05, + "belief": 0.09727730161764404, + "clearance": 0.07534228863589691, + "corridor": 0.23771371602108984, + "disturbance": 0.001875049049582837, + "grasp_affordance": 0.008910867576064034, + "occluder_contact": 0.1906791471622207, + "persistence": 3.784950184099602, + "phase": 0.6902159127322111, + "planner_ranking": 0.039764305716744275, + "planner_risk": 0.011417482169539047, + "planner_success": 0.5057139098644257, + "proposal_diversity": 0.0, + "proposal_ranking": 1.147210995356242, + "proposal_reconstruction": 0.06565308503129265, + "proposal_success": 0.6812662798346896, + "reocclusion": 0.6869303502819755, + "role_swap_consistency": 0.0, + "support_mode": 0.6485618005196253, + "support_stability": 0.14766556989740243, + "total": 1.4424566698796821, + "uncertainty": 0.00217234116809612, + "visibility": 0.08674176816235889, + "world_model": 2.087360879688552 + } + }, + { + "epoch": 2, + "train": { + "action": 0.021162556783345186, + "arm_role": 3.887634528310675e-05, + "belief": 0.12156015297299937, + "clearance": 0.08380865936020487, + "corridor": 0.24415273534893794, + "disturbance": 0.00342957377352401, + "grasp_affordance": 0.009738953835575988, + "occluder_contact": 0.2044433526302639, + "persistence": 2.334686749353373, + "phase": 0.4850014937551398, + "planner_ranking": 0.040600373610121955, + "planner_risk": 0.011128092848996043, + "planner_success": 0.5146951448760535, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1424914592190794, + "proposal_reconstruction": 0.06354390727846246, + "proposal_success": 0.6729260523068278, + "reocclusion": 0.45681651623331404, + "role_swap_consistency": 0.000693912741476915, + "support_mode": 0.3760432626071729, + "support_stability": 0.15264682037461746, + "total": 1.3047154269720378, + "uncertainty": 0.0018502298858421502, + "visibility": 0.10071343036466524, + "world_model": 2.0547038796700927 + }, + "val": { + "action": 0.021024575605141847, + "arm_role": 8.373512278494948e-06, + "belief": 0.14957294635700458, + "clearance": 0.07959625695013639, + "corridor": 0.23735206732244202, + "disturbance": 0.004530226309725549, + "grasp_affordance": 0.009394604938499855, + "occluder_contact": 0.20181630529237515, + "persistence": 1.9288715395060452, + "phase": 0.4471131846252264, + "planner_ranking": 0.032947048032920895, + "planner_risk": 0.010839967758246612, + "planner_success": 0.5091258653185584, + "proposal_diversity": 0.0, + "proposal_ranking": 1.133656098987117, + "proposal_reconstruction": 0.06369356336918744, + "proposal_success": 0.6579223636424902, + "reocclusion": 0.3941904430588086, + "role_swap_consistency": 0.0, + "support_mode": 0.22626511123257154, + "support_stability": 0.14589737135578285, + "total": 1.2399074046900778, + "uncertainty": 0.0006092997625246151, + "visibility": 0.0903791573011514, + "world_model": 1.9959143472440315 + } + }, + { + "epoch": 3, + "train": { + "action": 0.01603315635706837, + "arm_role": 0.002382638893629375, + "belief": 0.10478096155351714, + "clearance": 0.07576708702468558, + "corridor": 0.21942420092742204, + "disturbance": 0.0019073896570166414, + "grasp_affordance": 0.008396455439689912, + "occluder_contact": 0.1960402814965499, + "persistence": 1.2186123260438904, + "phase": 0.358337392305073, + "planner_ranking": 0.03375569848982483, + "planner_risk": 0.010724377139826845, + "planner_success": 0.4898807811893915, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1367171218520717, + "proposal_reconstruction": 0.05945674386855803, + "proposal_success": 0.6522450180430162, + "reocclusion": 0.28051841486564005, + "role_swap_consistency": 0.0010344118927222506, + "support_mode": 0.26600602300543535, + "support_stability": 0.13557514025780715, + "total": 1.1690542249303115, + "uncertainty": 0.0006959539541825458, + "visibility": 0.09422595846025568, + "world_model": 1.9601847686265643 + }, + "val": { + "action": 0.015446776827571519, + "arm_role": 9.393832596701313e-05, + "belief": 0.10828393223610791, + "clearance": 0.0738553563979539, + "corridor": 0.20814461167901754, + "disturbance": 0.0014511280261296743, + "grasp_affordance": 0.007996377563386253, + "occluder_contact": 0.1979678033879309, + "persistence": 0.8299788037935892, + "phase": 0.2583448259053209, + "planner_ranking": 0.03061466764219486, + "planner_risk": 0.011345374417336037, + "planner_success": 0.47457649852290296, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1192938667355161, + "proposal_reconstruction": 0.05910300570681239, + "proposal_success": 0.6505675640973178, + "reocclusion": 0.3073428579126344, + "role_swap_consistency": 0.0, + "support_mode": 0.23859061848347526, + "support_stability": 0.14255593956984353, + "total": 1.1266983756513307, + "uncertainty": 0.000752164227874759, + "visibility": 0.08751969581300562, + "world_model": 1.956845378333872 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..033fbd778155b678f8687fe7356a3aa5cb917fa3 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_clip_seed13", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_clip_seed13/checkpoint_best.pt", + "final_train_total": 1.1690542249303115, + "final_val_total": 1.1266983756513307, + "train_time_sec": 147.0101616382599, + "peak_gpu_memory_mb": 1895.4541015625, + "num_train_samples": 380, + "num_val_samples": 131, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..6fd5848efcbbe39e82be21e008516f85685c2ba6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5416666666666666, + "visibility_integral": 34.41302740573883, + "corridor_availability": 0.8933400412400564, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.6405131230011083, + "disturbance_cost": 0.3704787661942343 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..0a676fcb3a3d8a67b8789c72105d6e6d3e761125 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/checkpoint_best.pt +- mean_success: 0.542 +- visibility_integral: 34.413 +- corridor_availability: 0.893 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.641 +- disturbance_cost: 0.370 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..5e0acfb62535ce075397944fd468896a21789891 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5416666666666666, + "visibility_integral": 34.65096331967248, + "corridor_availability": 0.8933400412400564, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.6348470987268464, + "disturbance_cost": 0.36164701517878306 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..194c5e573e9be06fc7e72df3878f9f88af9f88e2 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_no_world_model/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/checkpoint_best.pt +- mean_success: 0.542 +- visibility_integral: 34.651 +- corridor_availability: 0.893 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.635 +- disturbance_cost: 0.362 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..d51664ca4637941a7beec387972ca255e3902cd8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5416666666666666, + "visibility_integral": 34.41317194037967, + "corridor_availability": 0.8933400412400564, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.639803415858654, + "disturbance_cost": 0.37048843161513406 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..516e5d7183a1aa9ef6ad58011f3a59dcdac495e2 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/benchmark_short_history/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/checkpoint_best.pt +- mean_success: 0.542 +- visibility_integral: 34.413 +- corridor_availability: 0.893 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.640 +- disturbance_cost: 0.370 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a696d5c3f8c9d9af76f2eebe257f06c3312751b3 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/config_resolved.yaml @@ -0,0 +1,148 @@ +experiment_name: proxy_interaction_r3d_stage2_dummy_seed21 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 21 +defaults: [] +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage2_dummy_seed21.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage2_dummy_seed21.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 21 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.3 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..a05ce1c9f2fb8448f6fad1142589b68c4b059afb --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.3383458646616541, + "planner_regret": 0.020659049972891808, + "planner_score_utility_spearman": 0.2586466372013092, + "risk_calibration_mse": 0.011588108725845814, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.026253661140799522, + "left_right_equivariance_error": 0.007271398872356205, + "belief_calibration_brier": 0.004160370212048292, + "reocclusion_calibration_brier": 0.2820528745651245, + "support_stability_mae": 0.030557002872228622, + "clearance_auc": 0.9069614725933284, + "memory_write_rate": 0.0, + "memory_saturation": 0.6733582615852356, + "num_samples": 133 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..8e65e21653831dd022ca59e262a61e8ed02b9091 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.029067292887096603, + "arm_role": 0.2128272018841623, + "belief": 0.23175121502329907, + "clearance": 0.1794816708813111, + "corridor": 0.2991743894914786, + "disturbance": 0.014563722050903985, + "grasp_affordance": 0.11285659003381927, + "occluder_contact": 0.2981356270611286, + "persistence": 5.024227797985077, + "phase": 0.736465490112702, + "planner_ranking": 0.7001801505684853, + "planner_risk": 0.029345064676211525, + "planner_success": 0.6331901401281357, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2601740161577861, + "proposal_reconstruction": 0.07138338964432478, + "proposal_success": 0.6750835478305817, + "reocclusion": 0.6895125756661097, + "role_swap_consistency": 0.0008129292400553823, + "support_mode": 0.7214357455571493, + "support_stability": 0.18610862642526627, + "total": 2.5757969667514167, + "uncertainty": 0.16812690005948147, + "visibility": 0.17425233901788792, + "world_model": 4.0634838839372 + }, + "val": { + "action": 0.02386013480524222, + "arm_role": 0.0004076675427818878, + "belief": 0.1069209881954723, + "clearance": 0.08219879203372532, + "corridor": 0.2415692475106981, + "disturbance": 0.0030337396116616824, + "grasp_affordance": 0.01165291853249073, + "occluder_contact": 0.22314749823676217, + "persistence": 4.605164660347833, + "phase": 0.8142086532380846, + "planner_ranking": 0.5563494629330106, + "planner_risk": 0.011604948745419582, + "planner_success": 0.6387051675054762, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2504341999689739, + "proposal_reconstruction": 0.06713124199046029, + "proposal_success": 0.6774384710523818, + "reocclusion": 0.7591080533133613, + "role_swap_consistency": 0.0, + "support_mode": 0.7771240539020963, + "support_stability": 0.135693629582723, + "total": 2.1388481987847223, + "uncertainty": 0.015495387733810477, + "visibility": 0.09591657254430982, + "world_model": 3.0181201563941107 + } + }, + { + "epoch": 1, + "train": { + "action": 0.020422046072781086, + "arm_role": 0.00013267093800095608, + "belief": 0.132033076758186, + "clearance": 0.09306831813106935, + "corridor": 0.2473244791229566, + "disturbance": 0.005267159331803366, + "grasp_affordance": 0.02323731636473288, + "occluder_contact": 0.2274861807624499, + "persistence": 4.788148105144501, + "phase": 0.4897861474504073, + "planner_ranking": 0.19803702970966697, + "planner_risk": 0.014476059819571674, + "planner_success": 0.582294854025046, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1768548240264256, + "proposal_reconstruction": 0.06309070127705733, + "proposal_success": 0.6687473853429159, + "reocclusion": 0.4347735444704692, + "role_swap_consistency": 0.0005246425753284711, + "support_mode": 0.27536690221556154, + "support_stability": 0.13952944738169512, + "total": 1.8044419437646866, + "uncertainty": 0.02774027381868412, + "visibility": 0.1153421513736248, + "world_model": 2.4898271610339484 + }, + "val": { + "action": 0.01874730870541599, + "arm_role": 5.6157629943401036e-05, + "belief": 0.10553244915273455, + "clearance": 0.07688990897602505, + "corridor": 0.22811337808767954, + "disturbance": 0.003250152357698729, + "grasp_affordance": 0.01229651603433821, + "occluder_contact": 0.2126419097185135, + "persistence": 4.407040860917833, + "phase": 0.5402041557762358, + "planner_ranking": 0.057698477473523885, + "planner_risk": 0.018357175298862986, + "planner_success": 0.5312860574987199, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1326524019241333, + "proposal_reconstruction": 0.061598031471172966, + "proposal_success": 0.684064143233829, + "reocclusion": 0.30786263280444676, + "role_swap_consistency": 0.0, + "support_mode": 0.004251669186891781, + "support_stability": 0.13853448629379272, + "total": 1.6857457160949707, + "uncertainty": 0.01232649458365308, + "visibility": 0.09530285745859146, + "world_model": 2.4774555497699313 + } + }, + { + "epoch": 2, + "train": { + "action": 0.016114005508522194, + "arm_role": 0.0001030681860356708, + "belief": 0.10320375890781482, + "clearance": 0.07995640703787406, + "corridor": 0.25392253262301284, + "disturbance": 0.0031722914403265654, + "grasp_affordance": 0.012748630911422273, + "occluder_contact": 0.2107334186633428, + "persistence": 2.4088165710369744, + "phase": 0.4625267634789149, + "planner_ranking": 0.060266673332080245, + "planner_risk": 0.012258843247157833, + "planner_success": 0.5274426229298115, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1385845243930817, + "proposal_reconstruction": 0.05936284347747763, + "proposal_success": 0.6725146919488907, + "reocclusion": 0.2462632873406013, + "role_swap_consistency": 0.0006072094838600606, + "support_mode": 0.0016275297427152207, + "support_stability": 0.1426111270363132, + "total": 1.3764432966709137, + "uncertainty": 0.009469694186312458, + "visibility": 0.09597749076783657, + "world_model": 2.160929208000501 + }, + "val": { + "action": 0.017381828278303146, + "arm_role": 0.00010448855997916932, + "belief": 0.10097876108354992, + "clearance": 0.07277507541908158, + "corridor": 0.2505771385298835, + "disturbance": 0.0016975371917295787, + "grasp_affordance": 0.009771786112752225, + "occluder_contact": 0.21183227002620697, + "persistence": 2.4857726428243847, + "phase": 0.4415881070825789, + "planner_ranking": 0.050767497469981514, + "planner_risk": 0.012091901567247178, + "planner_success": 0.5431661009788513, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1517571475770738, + "proposal_reconstruction": 0.06106388237741259, + "proposal_success": 0.6686976816919115, + "reocclusion": 0.3017841925223668, + "role_swap_consistency": 0.0, + "support_mode": 0.0003194726822483871, + "support_stability": 0.138791523873806, + "total": 1.3918594784206815, + "uncertainty": 0.006160195347749525, + "visibility": 0.09356896413697137, + "world_model": 2.1786467101838856 + } + }, + { + "epoch": 3, + "train": { + "action": 0.014575915721555551, + "arm_role": 0.00010951897911581909, + "belief": 0.11138264213999112, + "clearance": 0.08021063978473346, + "corridor": 0.22665666664640108, + "disturbance": 0.001938682675245218, + "grasp_affordance": 0.009700370137579739, + "occluder_contact": 0.21799744479358196, + "persistence": 1.6732217147946358, + "phase": 0.44999681537350017, + "planner_ranking": 0.042192295814553894, + "planner_risk": 0.01132670590110744, + "planner_success": 0.5080402580400308, + "proposal_diversity": 0.0, + "proposal_ranking": 1.137233888109525, + "proposal_reconstruction": 0.058159296245624624, + "proposal_success": 0.6529582714041074, + "reocclusion": 0.25044785129527253, + "role_swap_consistency": 0.0005161711233085953, + "support_mode": 0.00018608797411919417, + "support_stability": 0.13345634875198206, + "total": 1.2379883875449498, + "uncertainty": 0.0046325789056330295, + "visibility": 0.10680994981278975, + "world_model": 1.9994410425424576 + }, + "val": { + "action": 0.016705242089099355, + "arm_role": 4.718890462148314e-05, + "belief": 0.09792536165979174, + "clearance": 0.07563622544209163, + "corridor": 0.2376250127951304, + "disturbance": 0.002239807761119058, + "grasp_affordance": 0.008206432374815146, + "occluder_contact": 0.2136789427863227, + "persistence": 2.978070444530911, + "phase": 0.4751303195953369, + "planner_ranking": 0.0338772117263741, + "planner_risk": 0.011766589557131132, + "planner_success": 0.5005052321487002, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1342882580227323, + "proposal_reconstruction": 0.05980717432167795, + "proposal_success": 0.639495485358768, + "reocclusion": 0.3352541989750332, + "role_swap_consistency": 0.0, + "support_mode": 0.0004235156811773777, + "support_stability": 0.13641884757412803, + "total": 1.3906548553042941, + "uncertainty": 0.0036365572466618484, + "visibility": 0.09763797538148032, + "world_model": 2.049238271183438 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..7c1dda4ef4a72349adbabcbcc66b922b15fc5708 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_dummy_seed21", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed21/checkpoint_best.pt", + "final_train_total": 1.2379883875449498, + "final_val_total": 1.3906548553042941, + "train_time_sec": 18.177103996276855, + "peak_gpu_memory_mb": 639.55078125, + "num_train_samples": 379, + "num_val_samples": 133, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..947d338b656b32b71125643764b3a014a2eaab3a --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444443, + "visibility_integral": 33.861522571908104, + "corridor_availability": 0.8863558504316542, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.6200438848336538, + "disturbance_cost": 0.2896964028477669 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..fe4be4781986b09dfe5520aa344d81ec34ebf12d --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 33.862 +- corridor_availability: 0.886 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.620 +- disturbance_cost: 0.290 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..947d338b656b32b71125643764b3a014a2eaab3a --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444443, + "visibility_integral": 33.861522571908104, + "corridor_availability": 0.8863558504316542, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.6200438848336538, + "disturbance_cost": 0.2896964028477669 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..fe4be4781986b09dfe5520aa344d81ec34ebf12d --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_no_world_model/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 33.862 +- corridor_availability: 0.886 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.620 +- disturbance_cost: 0.290 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..fba9642f7ad6d3aae7ea336a3cb3a4f04b98514e --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5694444444444443, + "visibility_integral": 33.86345969637235, + "corridor_availability": 0.8863558504316542, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 1.6183116247653961, + "disturbance_cost": 0.2896275156591501 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..fe6f5e2dbd42ce79b21ef8f9b16706a50240f94c --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/benchmark_short_history/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 33.863 +- corridor_availability: 0.886 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 1.618 +- disturbance_cost: 0.290 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..120a05c579a02cb5837dea6e0e837736552231bf --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/config_resolved.yaml @@ -0,0 +1,148 @@ +experiment_name: proxy_interaction_r3d_stage2_dummy_seed22 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 22 +defaults: [] +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage2_dummy_seed22.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage2_dummy_seed22.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 22 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.3 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..566ac5b17916240f8b9b679ad9ac829fd56c67c1 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.3484848484848485, + "planner_regret": 0.020695989951491356, + "planner_score_utility_spearman": 0.23636364936828613, + "risk_calibration_mse": 0.011909244582057, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.022221416234970093, + "left_right_equivariance_error": 0.00428396016907166, + "belief_calibration_brier": 0.004661242943257093, + "reocclusion_calibration_brier": 0.2808501124382019, + "support_stability_mae": 0.023243192583322525, + "clearance_auc": 0.8644590429594041, + "memory_write_rate": 0.0, + "memory_saturation": 0.765249490737915, + "num_samples": 132 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..4ca12a531d738ea6971cfb1e1253cebc6ac6053d --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.03347124446493884, + "arm_role": 0.20229130648294813, + "belief": 0.17402074641237655, + "clearance": 0.19880834439148506, + "corridor": 0.2891631244371335, + "disturbance": 0.018553439459841076, + "grasp_affordance": 0.11385511832001309, + "occluder_contact": 0.29940443734327954, + "persistence": 5.160142799218495, + "phase": 0.8644107232491175, + "planner_ranking": 0.6737854778766632, + "planner_risk": 0.03873047609037409, + "planner_success": 0.6520731473962466, + "proposal_diversity": 0.0, + "proposal_ranking": 1.3132477800051372, + "proposal_reconstruction": 0.07480817381292582, + "proposal_success": 0.6795827721556028, + "reocclusion": 0.7054086849093437, + "role_swap_consistency": 0.0006326730472210329, + "support_mode": 0.6758991243938605, + "support_stability": 0.20185439257572094, + "total": 2.653773923714956, + "uncertainty": 0.17960463898877302, + "visibility": 0.18138946779072285, + "world_model": 4.2053997417291 + }, + "val": { + "action": 0.025188560287157696, + "arm_role": 0.00023987682490971766, + "belief": 0.13046854072146946, + "clearance": 0.0943274630440606, + "corridor": 0.25146762364440495, + "disturbance": 0.003108833476694094, + "grasp_affordance": 0.025106851425435808, + "occluder_contact": 0.25706369678179425, + "persistence": 5.160573641459147, + "phase": 0.6966154111756219, + "planner_ranking": 0.5090681347582076, + "planner_risk": 0.01417370161248578, + "planner_success": 0.6582367420196533, + "proposal_diversity": 0.0, + "proposal_ranking": 1.263387746281094, + "proposal_reconstruction": 0.06713862634367412, + "proposal_success": 0.6767676009072198, + "reocclusion": 0.6798703074455261, + "role_swap_consistency": 0.0, + "support_mode": 0.6764178209834628, + "support_stability": 0.13747453110085595, + "total": 2.237161636352539, + "uncertainty": 0.05560384856330024, + "visibility": 0.09796598636441761, + "world_model": 3.2540784147050648 + } + }, + { + "epoch": 1, + "train": { + "action": 0.020618916450378794, + "arm_role": 7.443887670888216e-05, + "belief": 0.12020893146594365, + "clearance": 0.09807458023230235, + "corridor": 0.24780173785984516, + "disturbance": 0.003880485649763917, + "grasp_affordance": 0.024978001097527642, + "occluder_contact": 0.23083883275588354, + "persistence": 4.817646026611328, + "phase": 0.4601554498076439, + "planner_ranking": 0.19355946235979596, + "planner_risk": 0.015518942285173884, + "planner_success": 0.6042056332031885, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1747117042541504, + "proposal_reconstruction": 0.06312885119890173, + "proposal_success": 0.6667466511329015, + "reocclusion": 0.3677233246465524, + "role_swap_consistency": 0.0004291342059635402, + "support_mode": 0.08244437400693035, + "support_stability": 0.13558734022080898, + "total": 1.7778482685486476, + "uncertainty": 0.04317541288522383, + "visibility": 0.11643363380183776, + "world_model": 2.5014847815036774 + }, + "val": { + "action": 0.019934887687365215, + "arm_role": 0.00020918159215297137, + "belief": 0.10726906028058794, + "clearance": 0.08435270521375868, + "corridor": 0.24421080119080013, + "disturbance": 0.0025649187963507655, + "grasp_affordance": 0.013776088029974036, + "occluder_contact": 0.22032455106576285, + "persistence": 4.736663394504124, + "phase": 0.4386194712585873, + "planner_ranking": 0.05592367466953066, + "planner_risk": 0.01741992651174466, + "planner_success": 0.5965519547462463, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1303200324376423, + "proposal_reconstruction": 0.06245918033851518, + "proposal_success": 0.6803573237525092, + "reocclusion": 0.300288421412309, + "role_swap_consistency": 0.0, + "support_mode": 0.0014197466015401813, + "support_stability": 0.13595510439740288, + "total": 1.7569248808754816, + "uncertainty": 0.017266521230340004, + "visibility": 0.09970718456639184, + "world_model": 2.5990555551317005 + } + }, + { + "epoch": 2, + "train": { + "action": 0.01646357006393373, + "arm_role": 0.00032456668426069274, + "belief": 0.11667600863923629, + "clearance": 0.0851635920504729, + "corridor": 0.2447526715695858, + "disturbance": 0.003258141950936988, + "grasp_affordance": 0.012976687673168877, + "occluder_contact": 0.21284440780679384, + "persistence": 2.953347126642863, + "phase": 0.44309895547727746, + "planner_ranking": 0.04747697835167249, + "planner_risk": 0.016302392507592838, + "planner_success": 0.5193743904431661, + "proposal_diversity": 0.0, + "proposal_ranking": 1.150391474366188, + "proposal_reconstruction": 0.05955767119303346, + "proposal_success": 0.6713465626041094, + "reocclusion": 0.2378139222661654, + "role_swap_consistency": 0.0005029737524940477, + "support_mode": 0.0011681052292260574, + "support_stability": 0.13539936766028404, + "total": 1.4113694926102955, + "uncertainty": 0.014864409691654146, + "visibility": 0.10114136214057605, + "world_model": 2.113257105151812 + }, + "val": { + "action": 0.01689246390014887, + "arm_role": 0.0002562762076397323, + "belief": 0.12067249417304993, + "clearance": 0.08075836963123745, + "corridor": 0.2332237097952101, + "disturbance": 0.0030973186884592804, + "grasp_affordance": 0.009670139031691683, + "occluder_contact": 0.19927391078737047, + "persistence": 2.144443233807882, + "phase": 0.4981871048609416, + "planner_ranking": 0.04188345455461078, + "planner_risk": 0.015286814835336473, + "planner_success": 0.5075024200810326, + "proposal_diversity": 0.0, + "proposal_ranking": 1.142343070771959, + "proposal_reconstruction": 0.0599246294134193, + "proposal_success": 0.6811430851618449, + "reocclusion": 0.31094949195782345, + "role_swap_consistency": 0.0, + "support_mode": 0.00042427888709223934, + "support_stability": 0.13746210518810484, + "total": 1.4014967216385736, + "uncertainty": 0.002499298451261388, + "visibility": 0.1028875137368838, + "world_model": 2.329009042845832 + } + }, + { + "epoch": 3, + "train": { + "action": 0.0155420743006592, + "arm_role": 0.00015497119996628803, + "belief": 0.11977876101930936, + "clearance": 0.07912964109952252, + "corridor": 0.23581945523619652, + "disturbance": 0.002367413486354053, + "grasp_affordance": 0.009577435072666654, + "occluder_contact": 0.2115720814714829, + "persistence": 1.9874264548222225, + "phase": 0.45408404618501663, + "planner_ranking": 0.038977843476459384, + "planner_risk": 0.011543226932796339, + "planner_success": 0.5249839027722677, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1570468346277873, + "proposal_reconstruction": 0.059013870234290756, + "proposal_success": 0.6656323795517286, + "reocclusion": 0.2790042506530881, + "role_swap_consistency": 0.0005038505745081542, + "support_mode": 0.03911329966407114, + "support_stability": 0.1344400765374303, + "total": 1.2917357434829075, + "uncertainty": 0.003919239621609449, + "visibility": 0.10335852671414614, + "world_model": 2.016709173719088 + }, + "val": { + "action": 0.01665111506978671, + "arm_role": 0.00012317704871141663, + "belief": 0.11847328394651413, + "clearance": 0.07685465945137872, + "corridor": 0.23272691004806095, + "disturbance": 0.0017439340590499341, + "grasp_affordance": 0.009264905523094866, + "occluder_contact": 0.20612997810045877, + "persistence": 2.2046579784817166, + "phase": 0.45347891251246136, + "planner_ranking": 0.03614056089686023, + "planner_risk": 0.012666935566812754, + "planner_success": 0.5357781946659088, + "proposal_diversity": 0.0, + "proposal_ranking": 1.118505941496955, + "proposal_reconstruction": 0.059652416656414665, + "proposal_success": 0.6818766991297404, + "reocclusion": 0.31799929009543526, + "role_swap_consistency": 0.0, + "support_mode": 0.00013029564353119995, + "support_stability": 0.13687688443395826, + "total": 1.3739903701676264, + "uncertainty": 0.0021665632569541535, + "visibility": 0.09471688088443544, + "world_model": 2.233483672142029 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..884e7ea038c79d0c2384b31faacd08aba16f3fa3 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_dummy_seed22", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed22/checkpoint_best.pt", + "final_train_total": 1.2917357434829075, + "final_val_total": 1.3739903701676264, + "train_time_sec": 21.751301288604736, + "peak_gpu_memory_mb": 635.970703125, + "num_train_samples": 380, + "num_val_samples": 132, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..98a7634dd43c1f7bea6523bbc55b13874987dc99 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.625 + }, + "mean_success": 0.5277777777777778, + "visibility_integral": 31.055846561988194, + "corridor_availability": 0.8294495956765281, + "reocclusion_rate": 0.036193347953216375, + "persistence_horizon_mae": 2.446918895718322, + "disturbance_cost": 0.2842518512883948 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..a8d42ad7626f90ef68c6867e28444af9f526a7d9 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/checkpoint_best.pt +- mean_success: 0.528 +- visibility_integral: 31.056 +- corridor_availability: 0.829 +- reocclusion_rate: 0.036 +- persistence_horizon_mae: 2.447 +- disturbance_cost: 0.284 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.625 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..b8975ffea8ff240a4211f7ad5a782d9b753c95a1 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.625 + }, + "mean_success": 0.5277777777777778, + "visibility_integral": 31.244046566387016, + "corridor_availability": 0.8636231190628476, + "reocclusion_rate": 0.00798611111111111, + "persistence_horizon_mae": 2.825085285899754, + "disturbance_cost": 0.3346485110103256 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..81759851de48d3bda9bf9a04a4c57b07773c56f7 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_no_world_model/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/checkpoint_best.pt +- mean_success: 0.528 +- visibility_integral: 31.244 +- corridor_availability: 0.864 +- reocclusion_rate: 0.008 +- persistence_horizon_mae: 2.825 +- disturbance_cost: 0.335 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.625 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..bc9c6b37c343109b256d01bc0bde6de0af74717c --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.625 + }, + "mean_success": 0.5277777777777778, + "visibility_integral": 31.054917756054138, + "corridor_availability": 0.8292781271868281, + "reocclusion_rate": 0.036366959064327485, + "persistence_horizon_mae": 2.4464666320020285, + "disturbance_cost": 0.2843864895920787 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..dcbbbe31aceba3f45f901e2d7277cde41a3f50ec --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/benchmark_short_history/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/checkpoint_best.pt +- mean_success: 0.528 +- visibility_integral: 31.055 +- corridor_availability: 0.829 +- reocclusion_rate: 0.036 +- persistence_horizon_mae: 2.446 +- disturbance_cost: 0.284 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.625 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00c657eb23e26c38238200a9ac782819c1acebf3 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/config_resolved.yaml @@ -0,0 +1,148 @@ +experiment_name: proxy_interaction_r3d_stage2_dummy_seed23 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 23 +defaults: [] +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 96 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_v6_rgbd_stage2_dummy_seed23.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_v6_rgbd_stage2_dummy_seed23.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 23 +optim: + epochs: 4 + batch_size: 16 + num_workers: 4 + lr: 0.001 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: false + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: false + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 192 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: true + fusion: + hidden_dim: 192 + num_cameras: 3 + num_layers: 2 + num_heads: 4 + ff_dim: 384 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 192 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 4 + max_history_steps: 8 + decoder: + hidden_dim: 192 + num_heads: 4 + num_layers: 2 + ff_dim: 384 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 192 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 4 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 192 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 4 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 192 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 4 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.15 + arm_role: 0.2 + support_mode: 0.15 + corridor: 0.2 + persistence: 0.1 + disturbance: 0.1 + world_model: 0.3 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.2 + planner_risk: 0.1 + planner_ranking: 0.1 + proposal_reconstruction: 0.2 + proposal_success: 0.1 + proposal_ranking: 0.1 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..0693b4b7422520470edad09c32366527610a9ad6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.3458646616541353, + "planner_regret": 0.020924845710396767, + "planner_score_utility_spearman": 0.22406017780303955, + "risk_calibration_mse": 0.01817331090569496, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.02499752677977085, + "left_right_equivariance_error": 0.0035538733252050247, + "belief_calibration_brier": 0.016437487676739693, + "reocclusion_calibration_brier": 0.2605345547199249, + "support_stability_mae": 0.03507188707590103, + "clearance_auc": 0.8892945983340073, + "memory_write_rate": 0.0, + "memory_saturation": 0.8572164177894592, + "num_samples": 133 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea2ef14bdb65bd0980161234b4efb14fb39f52f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.029319839163993795, + "arm_role": 0.23177419497793986, + "belief": 0.19956070557236671, + "clearance": 0.19428976656248173, + "corridor": 0.28412687219679356, + "disturbance": 0.014706775381152207, + "grasp_affordance": 0.09792078468793382, + "occluder_contact": 0.26536280413468677, + "persistence": 4.883942524592082, + "phase": 0.7541014266510805, + "planner_ranking": 0.6753277728954951, + "planner_risk": 0.025359969469718635, + "planner_success": 0.6392867142955462, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2586021423339844, + "proposal_reconstruction": 0.07134231490393479, + "proposal_success": 0.6742175792654356, + "reocclusion": 0.6968964214126269, + "role_swap_consistency": 0.0008294641763010683, + "support_mode": 0.7254589063425859, + "support_stability": 0.20699986908584833, + "total": 2.557743047674497, + "uncertainty": 0.2125748023002719, + "visibility": 0.1725630493213733, + "world_model": 4.047068367401759 + }, + "val": { + "action": 0.02311015480922328, + "arm_role": 0.00018451267129017247, + "belief": 0.10730510370598899, + "clearance": 0.08969895541667938, + "corridor": 0.2666405571831597, + "disturbance": 0.0017999378841422084, + "grasp_affordance": 0.009250536198831268, + "occluder_contact": 0.2300689915815989, + "persistence": 4.78337902492947, + "phase": 0.7185595366689894, + "planner_ranking": 0.4548414415783352, + "planner_risk": 0.012839581610427963, + "planner_success": 0.6475298735830519, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1986345450083415, + "proposal_reconstruction": 0.06549014771978061, + "proposal_success": 0.6734013424979316, + "reocclusion": 0.6851721008618673, + "role_swap_consistency": 0.0, + "support_mode": 0.6492728524737887, + "support_stability": 0.13324941943089166, + "total": 2.1259667608473034, + "uncertainty": 0.007739724384413825, + "visibility": 0.09847861197259691, + "world_model": 3.0716149542066784 + } + }, + { + "epoch": 1, + "train": { + "action": 0.01952160553385814, + "arm_role": 8.932303330766445e-05, + "belief": 0.12918406135092178, + "clearance": 0.0892887885371844, + "corridor": 0.2474869458625714, + "disturbance": 0.0037682938176052025, + "grasp_affordance": 0.017499797123794753, + "occluder_contact": 0.21778892911970615, + "persistence": 4.783275107542674, + "phase": 0.49998418365915615, + "planner_ranking": 0.1362916425180932, + "planner_risk": 0.016128994757309556, + "planner_success": 0.5813094191253185, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1601267904043198, + "proposal_reconstruction": 0.06217473496993383, + "proposal_success": 0.6659636174639066, + "reocclusion": 0.42425032146275043, + "role_swap_consistency": 0.0004794289428294481, + "support_mode": 0.2768289456823065, + "support_stability": 0.1436432379608353, + "total": 1.7938196162382762, + "uncertainty": 0.03192775448163351, + "visibility": 0.11060131123910348, + "world_model": 2.4868411223093667 + }, + "val": { + "action": 0.020857719497548208, + "arm_role": 0.0003135378614792393, + "belief": 0.11252795242600971, + "clearance": 0.07942688961823781, + "corridor": 0.23491873840490976, + "disturbance": 0.002107284943728397, + "grasp_affordance": 0.012219702411029074, + "occluder_contact": 0.2054839183886846, + "persistence": 5.023713111877441, + "phase": 0.429247111082077, + "planner_ranking": 0.06150271536575423, + "planner_risk": 0.014784476098914942, + "planner_success": 0.5266371270020803, + "proposal_diversity": 0.0, + "proposal_ranking": 1.151251130633884, + "proposal_reconstruction": 0.06296455942922169, + "proposal_success": 0.6976126233736674, + "reocclusion": 0.31344303902652526, + "role_swap_consistency": 0.0, + "support_mode": 0.0009844449426357944, + "support_stability": 0.137757099337048, + "total": 1.758366995387607, + "uncertainty": 0.028360916922489803, + "visibility": 0.0932084388203091, + "world_model": 2.549745281537374 + } + }, + { + "epoch": 2, + "train": { + "action": 0.01677789391639332, + "arm_role": 0.00018648877752032908, + "belief": 0.10787749228378136, + "clearance": 0.08126144856214523, + "corridor": 0.2433396608879169, + "disturbance": 0.00258097746943046, + "grasp_affordance": 0.017474771360866725, + "occluder_contact": 0.20834970474243164, + "persistence": 2.438386077682177, + "phase": 0.4408506167431672, + "planner_ranking": 0.0513462177477777, + "planner_risk": 0.012917533827324709, + "planner_success": 0.5113103551169237, + "proposal_diversity": 0.0, + "proposal_ranking": 1.149764706691106, + "proposal_reconstruction": 0.06004528577129046, + "proposal_success": 0.675537995994091, + "reocclusion": 0.25245313874135417, + "role_swap_consistency": 0.000515319329376022, + "support_mode": 0.0019980755605502054, + "support_stability": 0.140461476209263, + "total": 1.388836865623792, + "uncertainty": 0.022540901283112664, + "visibility": 0.09868530587603648, + "world_model": 2.2134085396925607 + }, + "val": { + "action": 0.017413452060686216, + "arm_role": 6.871581151952139e-05, + "belief": 0.11101349939902623, + "clearance": 0.08136323259936439, + "corridor": 0.254426423046324, + "disturbance": 0.0028602277549604573, + "grasp_affordance": 0.010002103013296923, + "occluder_contact": 0.22746851212448543, + "persistence": 2.106385005844964, + "phase": 0.46620431542396545, + "planner_ranking": 0.037967391312122345, + "planner_risk": 0.01383865676406357, + "planner_success": 0.5223823752668169, + "proposal_diversity": 0.0, + "proposal_ranking": 1.120710015296936, + "proposal_reconstruction": 0.06028586791621314, + "proposal_success": 0.6775065395567152, + "reocclusion": 0.29894231177038616, + "role_swap_consistency": 0.0, + "support_mode": 0.00042318304379781085, + "support_stability": 0.14477568368117014, + "total": 1.4280590216318767, + "uncertainty": 0.015508349053561687, + "visibility": 0.10366267793708378, + "world_model": 2.4275851249694824 + } + }, + { + "epoch": 3, + "train": { + "action": 0.014798827391738692, + "arm_role": 0.00012019669429719215, + "belief": 0.10037506744265556, + "clearance": 0.0759961671816806, + "corridor": 0.21944596556325754, + "disturbance": 0.0022576948249479756, + "grasp_affordance": 0.008150271993751327, + "occluder_contact": 0.204491992170612, + "persistence": 1.7069302797317505, + "phase": 0.4352826727554202, + "planner_ranking": 0.039453314462055765, + "planner_risk": 0.01098932025100415, + "planner_success": 0.49703357741236687, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1383505016565323, + "proposal_reconstruction": 0.058284393356492124, + "proposal_success": 0.6598181600371996, + "reocclusion": 0.24928847063953677, + "role_swap_consistency": 0.00044218337643542327, + "support_mode": 0.00023409606789452178, + "support_stability": 0.13384470157325268, + "total": 1.2625050817926724, + "uncertainty": 0.006061152564749743, + "visibility": 0.0954263440022866, + "world_model": 2.093868618210157 + }, + "val": { + "action": 0.016459165140986443, + "arm_role": 0.00010593322319133829, + "belief": 0.11468188961346944, + "clearance": 0.07586023211479187, + "corridor": 0.2960966345336702, + "disturbance": 0.003753158315602276, + "grasp_affordance": 0.008578508730149932, + "occluder_contact": 0.2071819139851464, + "persistence": 2.1860306660334268, + "phase": 0.4479760792520311, + "planner_ranking": 0.03706499561667442, + "planner_risk": 0.013231952778167196, + "planner_success": 0.4939282072914971, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1495306624306574, + "proposal_reconstruction": 0.05932460932268037, + "proposal_success": 0.6537699633174472, + "reocclusion": 0.29562795327769387, + "role_swap_consistency": 0.0, + "support_mode": 0.00016798629722971882, + "support_stability": 0.13361257563034692, + "total": 1.392296102311876, + "uncertainty": 0.0055736687241329085, + "visibility": 0.09414981967873043, + "world_model": 2.2928188774320812 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..1b0ad63ab15cc90281b79ed45ca4356f79da49b9 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/summary.json @@ -0,0 +1,14 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage2_dummy_seed23", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage2_dummy_seed23/checkpoint_best.pt", + "final_train_total": 1.2625050817926724, + "final_val_total": 1.392296102311876, + "train_time_sec": 22.338274240493774, + "peak_gpu_memory_mb": 642.658203125, + "num_train_samples": 380, + "num_val_samples": 133, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": null +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..41c495ecb4cd2eac11e8e36c401ac0a8706dfef8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5833333333333334, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5972222222222223, + "visibility_integral": 17.476146274142796, + "corridor_availability": 0.6671382097734345, + "reocclusion_rate": 0.022222222222222223, + "persistence_horizon_mae": 2.4605967335703705, + "disturbance_cost": 0.15201607884632218 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..caebf0261f5ea2e2995704d66d0e76c2d17fd3b8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt +- mean_success: 0.597 +- visibility_integral: 17.476 +- corridor_availability: 0.667 +- reocclusion_rate: 0.022 +- persistence_horizon_mae: 2.461 +- disturbance_cost: 0.152 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.583 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..fb1524ab69b165e66adf25b1d266ea74407f969f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4166666666666667, + "bag_proxy": 0.4583333333333333, + "cloth_proxy": 0.6666666666666666 + }, + "mean_success": 0.5138888888888888, + "visibility_integral": 17.8445434462693, + "corridor_availability": 0.6146270692762401, + "reocclusion_rate": 0.025173611111111115, + "persistence_horizon_mae": 2.6810189323804137, + "disturbance_cost": 0.1732833090548714 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..d19910714c310557e6c65c0fc19cac175e902df6 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/benchmark_no_depth/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt +- mean_success: 0.514 +- visibility_integral: 17.845 +- corridor_availability: 0.615 +- reocclusion_rate: 0.025 +- persistence_horizon_mae: 2.681 +- disturbance_cost: 0.173 +- foliage_proxy_success: 0.417 +- bag_proxy_success: 0.458 +- cloth_proxy_success: 0.667 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..79cd3d6e38e4b0a081d397b9d5130b5ca69707bc --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_seed17 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 17 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_seed17.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_seed17.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 17 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: true + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..0fc231d5c506b0afee033d9b067a898a9721261e --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.29770992366412213, + "planner_regret": 0.013548726215958595, + "planner_score_utility_spearman": 0.19083969295024872, + "risk_calibration_mse": 0.010792036540806293, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.033339403569698334, + "left_right_equivariance_error": 6.996666387280901e-05, + "belief_calibration_brier": 0.004780409391969442, + "reocclusion_calibration_brier": 0.2683986723423004, + "support_stability_mae": 0.027458177879452705, + "clearance_auc": 0.9621755433404506, + "memory_write_rate": 0.4997691512107849, + "memory_saturation": 0.5063503980636597, + "num_samples": 131 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..86553cbe14abad380b398fb218f2be36df31dc72 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.02815176180416816, + "arm_role": 0.029603523643393265, + "belief": 0.11383584796598083, + "clearance": 0.09396348909327859, + "corridor": 0.27990493799902894, + "disturbance": 0.0038364405198463877, + "grasp_affordance": 0.021576231786687123, + "occluder_contact": 0.21160779281666403, + "persistence": 5.600160198503511, + "phase": 0.7736630088404606, + "planner_ranking": 0.22218675435226606, + "planner_risk": 0.016042623469394657, + "planner_success": 0.6215873261815623, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2711315641277714, + "proposal_reconstruction": 0.06910703382209728, + "proposal_success": 0.6854824983759931, + "reocclusion": 0.7048384636640549, + "role_swap_consistency": 0.0009499136090437931, + "support_mode": 0.7124486019736842, + "support_stability": 0.16308180080040505, + "total": 1.80654469226536, + "uncertainty": 0.018382124003638686, + "visibility": 0.11109156565446603, + "world_model": 2.6661070312324324 + }, + "val": { + "action": 0.02329161667236776, + "arm_role": 1.2488904289828938e-05, + "belief": 0.09460798730001305, + "clearance": 0.07680663082635764, + "corridor": 0.23859078346779852, + "disturbance": 0.0018481058845778039, + "grasp_affordance": 0.008316120937127958, + "occluder_contact": 0.20198668494369043, + "persistence": 4.336097291021636, + "phase": 0.6960404786196622, + "planner_ranking": 0.05048516956263815, + "planner_risk": 0.011283920221268834, + "planner_success": 0.6002033824721972, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1648338744134614, + "proposal_reconstruction": 0.06488600407134403, + "proposal_success": 0.6840101480484009, + "reocclusion": 0.6917306789846132, + "role_swap_consistency": 0.0, + "support_mode": 0.6627979712052778, + "support_stability": 0.14844087350436233, + "total": 1.679735080762343, + "uncertainty": 0.0017192336152110136, + "visibility": 0.08843542991036718, + "world_model": 2.796021072250424 + } + }, + { + "epoch": 1, + "train": { + "action": 0.022217433576128983, + "arm_role": 1.2244989997462223e-05, + "belief": 0.10026986102916692, + "clearance": 0.08305128944155417, + "corridor": 0.2513870910986474, + "disturbance": 0.002561395750442324, + "grasp_affordance": 0.009610651308474572, + "occluder_contact": 0.2029949708988792, + "persistence": 4.274056613445282, + "phase": 0.6910824424342106, + "planner_ranking": 0.05793145077140683, + "planner_risk": 0.01096556101493709, + "planner_success": 0.5687428168560329, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1443857858055517, + "proposal_reconstruction": 0.06427661131479238, + "proposal_success": 0.677754524193312, + "reocclusion": 0.7005640453413913, + "role_swap_consistency": 0.000685913089546375, + "support_mode": 0.6932514391447369, + "support_stability": 0.1438921767728109, + "total": 1.5099341706225746, + "uncertainty": 0.0017574461604358237, + "visibility": 0.09719569808558413, + "world_model": 2.1559007089389 + }, + "val": { + "action": 0.022725384470752695, + "arm_role": 5.272072281293345e-05, + "belief": 0.09615642464522159, + "clearance": 0.08045592444073973, + "corridor": 0.23664624672947507, + "disturbance": 0.0026030399920714308, + "grasp_affordance": 0.008498197915314726, + "occluder_contact": 0.20824123399727273, + "persistence": 4.317030298890489, + "phase": 0.6911809751481721, + "planner_ranking": 0.04295492070951062, + "planner_risk": 0.011065340517124048, + "planner_success": 0.48531281767469464, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1322515372074011, + "proposal_reconstruction": 0.06503733078187163, + "proposal_success": 0.6843914163835121, + "reocclusion": 0.6961798333760464, + "role_swap_consistency": 0.0, + "support_mode": 0.6560130250273328, + "support_stability": 0.15669836300058346, + "total": 1.5237527164545925, + "uncertainty": 0.00022470286212134823, + "visibility": 0.09155982241711834, + "world_model": 2.325164812080788 + } + }, + { + "epoch": 2, + "train": { + "action": 0.021325080562382935, + "arm_role": 3.0533263557835627e-06, + "belief": 0.10637223070585414, + "clearance": 0.08482141373188871, + "corridor": 0.24448425138467236, + "disturbance": 0.0027837532476089684, + "grasp_affordance": 0.010188110915355776, + "occluder_contact": 0.20852525508717487, + "persistence": 3.92157253560267, + "phase": 0.6793996710526315, + "planner_ranking": 0.04173064419405579, + "planner_risk": 0.01095533966547105, + "planner_success": 0.5252253392809316, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1396061564746656, + "proposal_reconstruction": 0.06338561845845297, + "proposal_success": 0.6762565412019429, + "reocclusion": 0.6968451614442625, + "role_swap_consistency": 0.00035725896036877344, + "support_mode": 0.67568359375, + "support_stability": 0.1382079716869875, + "total": 1.4412984270798532, + "uncertainty": 0.0013268421901700908, + "visibility": 0.10235781155918774, + "world_model": 2.0309231833407755 + }, + "val": { + "action": 0.02207596722820943, + "arm_role": 5.734688214130129e-07, + "belief": 0.10629133389077404, + "clearance": 0.08537082306363365, + "corridor": 0.2549011127063722, + "disturbance": 0.0016559935975788958, + "grasp_affordance": 0.009338989832692525, + "occluder_contact": 0.22646727706446793, + "persistence": 3.3828756935668713, + "phase": 0.6586590924046256, + "planner_ranking": 0.037367004423382845, + "planner_risk": 0.010625976211017449, + "planner_success": 0.4890047986850594, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1231766451488843, + "proposal_reconstruction": 0.06420681360318806, + "proposal_success": 0.686759861129703, + "reocclusion": 0.6883266523028865, + "role_swap_consistency": 0.0, + "support_mode": 0.6401153613220562, + "support_stability": 0.1437410551095099, + "total": 1.4105993129990317, + "uncertainty": 0.0012141159448109454, + "visibility": 0.09800120776124073, + "world_model": 2.073751407139229 + } + }, + { + "epoch": 3, + "train": { + "action": 0.01997323781742077, + "arm_role": 6.544025320755808e-05, + "belief": 0.1165526967299612, + "clearance": 0.08391809918378529, + "corridor": 0.22205955819808879, + "disturbance": 0.0020432069609006777, + "grasp_affordance": 0.010270981588645985, + "occluder_contact": 0.2182939759994808, + "persistence": 1.2004593090216034, + "phase": 0.48546034160413243, + "planner_ranking": 0.034699498282608196, + "planner_risk": 0.010634258209029213, + "planner_success": 0.5031348044150754, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1341818285615821, + "proposal_reconstruction": 0.06254415310135013, + "proposal_success": 0.6701703595487695, + "reocclusion": 0.4238589314466624, + "role_swap_consistency": 0.0005717776034879008, + "support_mode": 0.25150866257516963, + "support_stability": 0.1331169359729086, + "total": 1.194669181735892, + "uncertainty": 0.0006598267668334139, + "visibility": 0.11416163087675446, + "world_model": 1.94043176832952 + }, + "val": { + "action": 0.021236835851926695, + "arm_role": 3.174391024734205e-06, + "belief": 0.11499111557548697, + "clearance": 0.079068739305843, + "corridor": 0.21379030992587408, + "disturbance": 0.0011454115509443661, + "grasp_affordance": 0.01121496031004371, + "occluder_contact": 0.21270102081876813, + "persistence": 1.763856044095574, + "phase": 0.3144524506095684, + "planner_ranking": 0.04632020881961745, + "planner_risk": 0.010791946026570671, + "planner_success": 0.4698902251142444, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1357553402582805, + "proposal_reconstruction": 0.06354159370742062, + "proposal_success": 0.6496003658482523, + "reocclusion": 0.31966371032776253, + "role_swap_consistency": 0.0, + "support_mode": 0.0014491713399448517, + "support_stability": 0.13909960007577232, + "total": 1.1584763833970735, + "uncertainty": 0.00029339295746688026, + "visibility": 0.09549486710492408, + "world_model": 1.911818307457548 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/summary.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/summary.json new file mode 100644 index 0000000000000000000000000000000000000000..050b14c6daecee35aaf514f8058d588b84563b26 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/summary.json @@ -0,0 +1,557 @@ +{ + "experiment_name": "proxy_interaction_r3d_stage3_clip_rgbd_seed17", + "device": "cuda", + "best_checkpoint": "/workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed17/checkpoint_best.pt", + "final_train_total": 1.194669181735892, + "final_val_total": 1.1584763833970735, + "train_time_sec": 138.50738143920898, + "peak_gpu_memory_mb": 1933.771484375, + "num_train_samples": 380, + "num_val_samples": 131, + "planner_mode": "trainable", + "frozen_modules": [], + "init_info": { + "path": "/workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt", + "loaded_keys": 461, + "skipped_shape_mismatch_keys": [ + "memory.gru.weight_ih_l0", + "memory.gru.weight_hh_l0", + "memory.gru.bias_ih_l0", + "memory.gru.bias_hh_l0", + "memory.token_proj.0.weight", + "memory.token_proj.0.bias", + "memory.token_proj.1.weight", + "memory.token_proj.1.bias", + "decoder.actor_role_bias", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.0.linear1.weight", + "decoder.revealer_decoder.layers.0.linear1.bias", + "decoder.revealer_decoder.layers.0.linear2.weight", + "decoder.revealer_decoder.layers.0.linear2.bias", + "decoder.revealer_decoder.layers.0.norm1.weight", + "decoder.revealer_decoder.layers.0.norm1.bias", + "decoder.revealer_decoder.layers.0.norm2.weight", + "decoder.revealer_decoder.layers.0.norm2.bias", + "decoder.revealer_decoder.layers.0.norm3.weight", + "decoder.revealer_decoder.layers.0.norm3.bias", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.1.linear1.weight", + "decoder.revealer_decoder.layers.1.linear1.bias", + "decoder.revealer_decoder.layers.1.linear2.weight", + "decoder.revealer_decoder.layers.1.linear2.bias", + "decoder.revealer_decoder.layers.1.norm1.weight", + "decoder.revealer_decoder.layers.1.norm1.bias", + "decoder.revealer_decoder.layers.1.norm2.weight", + "decoder.revealer_decoder.layers.1.norm2.bias", + "decoder.revealer_decoder.layers.1.norm3.weight", + "decoder.revealer_decoder.layers.1.norm3.bias", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.2.linear1.weight", + "decoder.revealer_decoder.layers.2.linear1.bias", + "decoder.revealer_decoder.layers.2.linear2.weight", + "decoder.revealer_decoder.layers.2.linear2.bias", + "decoder.revealer_decoder.layers.2.norm1.weight", + "decoder.revealer_decoder.layers.2.norm1.bias", + "decoder.revealer_decoder.layers.2.norm2.weight", + "decoder.revealer_decoder.layers.2.norm2.bias", + "decoder.revealer_decoder.layers.2.norm3.weight", + "decoder.revealer_decoder.layers.2.norm3.bias", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.self_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.self_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.revealer_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.revealer_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.revealer_decoder.layers.3.linear1.weight", + "decoder.revealer_decoder.layers.3.linear1.bias", + "decoder.revealer_decoder.layers.3.linear2.weight", + "decoder.revealer_decoder.layers.3.linear2.bias", + "decoder.revealer_decoder.layers.3.norm1.weight", + "decoder.revealer_decoder.layers.3.norm1.bias", + "decoder.revealer_decoder.layers.3.norm2.weight", + "decoder.revealer_decoder.layers.3.norm2.bias", + "decoder.revealer_decoder.layers.3.norm3.weight", + "decoder.revealer_decoder.layers.3.norm3.bias", + "decoder.actor_decoder.layers.0.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.0.linear1.weight", + "decoder.actor_decoder.layers.0.linear1.bias", + "decoder.actor_decoder.layers.0.linear2.weight", + "decoder.actor_decoder.layers.0.linear2.bias", + "decoder.actor_decoder.layers.0.norm1.weight", + "decoder.actor_decoder.layers.0.norm1.bias", + "decoder.actor_decoder.layers.0.norm2.weight", + "decoder.actor_decoder.layers.0.norm2.bias", + "decoder.actor_decoder.layers.0.norm3.weight", + "decoder.actor_decoder.layers.0.norm3.bias", + "decoder.actor_decoder.layers.1.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.1.linear1.weight", + "decoder.actor_decoder.layers.1.linear1.bias", + "decoder.actor_decoder.layers.1.linear2.weight", + "decoder.actor_decoder.layers.1.linear2.bias", + "decoder.actor_decoder.layers.1.norm1.weight", + "decoder.actor_decoder.layers.1.norm1.bias", + "decoder.actor_decoder.layers.1.norm2.weight", + "decoder.actor_decoder.layers.1.norm2.bias", + "decoder.actor_decoder.layers.1.norm3.weight", + "decoder.actor_decoder.layers.1.norm3.bias", + "decoder.actor_decoder.layers.2.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.2.linear1.weight", + "decoder.actor_decoder.layers.2.linear1.bias", + "decoder.actor_decoder.layers.2.linear2.weight", + "decoder.actor_decoder.layers.2.linear2.bias", + "decoder.actor_decoder.layers.2.norm1.weight", + "decoder.actor_decoder.layers.2.norm1.bias", + "decoder.actor_decoder.layers.2.norm2.weight", + "decoder.actor_decoder.layers.2.norm2.bias", + "decoder.actor_decoder.layers.2.norm3.weight", + "decoder.actor_decoder.layers.2.norm3.bias", + "decoder.actor_decoder.layers.3.self_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.self_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.self_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.self_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.actor_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.actor_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.actor_decoder.layers.3.linear1.weight", + "decoder.actor_decoder.layers.3.linear1.bias", + "decoder.actor_decoder.layers.3.linear2.weight", + "decoder.actor_decoder.layers.3.linear2.bias", + "decoder.actor_decoder.layers.3.norm1.weight", + "decoder.actor_decoder.layers.3.norm1.bias", + "decoder.actor_decoder.layers.3.norm2.weight", + "decoder.actor_decoder.layers.3.norm2.bias", + "decoder.actor_decoder.layers.3.norm3.weight", + "decoder.actor_decoder.layers.3.norm3.bias", + "decoder.revealer_mean.weight", + "decoder.revealer_mean.bias", + "decoder.revealer_log_std.weight", + "decoder.revealer_log_std.bias", + "decoder.actor_mean.weight", + "decoder.actor_mean.bias", + "decoder.actor_log_std.weight", + "decoder.actor_log_std.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias" + ], + "missing_keys": [ + "backbone.depth_adapter.depth_proj.0.weight", + "backbone.depth_adapter.depth_proj.0.bias", + "backbone.depth_adapter.depth_proj.1.weight", + "backbone.depth_adapter.depth_proj.1.bias", + "backbone.depth_adapter.depth_proj.3.weight", + "backbone.depth_adapter.depth_proj.3.bias", + "backbone.depth_adapter.geometry_proj.0.weight", + "backbone.depth_adapter.geometry_proj.0.bias", + "backbone.depth_adapter.geometry_proj.1.weight", + "backbone.depth_adapter.geometry_proj.1.bias", + "backbone.depth_adapter.camera_proj.0.weight", + "backbone.depth_adapter.camera_proj.0.bias", + "backbone.depth_adapter.camera_proj.1.weight", + "backbone.depth_adapter.camera_proj.1.bias", + "fusion.geometry_fusion.attn.in_proj_weight", + "fusion.geometry_fusion.attn.in_proj_bias", + "fusion.geometry_fusion.attn.out_proj.weight", + "fusion.geometry_fusion.attn.out_proj.bias", + "fusion.geometry_fusion.gate.0.weight", + "fusion.geometry_fusion.gate.0.bias", + "fusion.geometry_fusion.gate.1.weight", + "fusion.geometry_fusion.gate.1.bias", + "fusion.geometry_fusion.gate.3.weight", + "fusion.geometry_fusion.gate.3.bias", + "fusion.geometry_fusion.out.0.weight", + "fusion.geometry_fusion.out.0.bias", + "fusion.geometry_fusion.out.1.weight", + "fusion.geometry_fusion.out.1.bias", + "memory.scene_memory.position_embedding", + "memory.scene_memory.bank_queries", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.scene_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear1.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear1.bias", + "memory.scene_memory.sequence_encoder.layers.0.linear2.weight", + "memory.scene_memory.sequence_encoder.layers.0.linear2.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm1.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm1.bias", + "memory.scene_memory.sequence_encoder.layers.0.norm2.weight", + "memory.scene_memory.sequence_encoder.layers.0.norm2.bias", + "memory.scene_memory.bank_attention.in_proj_weight", + "memory.scene_memory.bank_attention.in_proj_bias", + "memory.scene_memory.bank_attention.out_proj.weight", + "memory.scene_memory.bank_attention.out_proj.bias", + "memory.scene_memory.action_proj.0.weight", + "memory.scene_memory.action_proj.0.bias", + "memory.scene_memory.action_proj.1.weight", + "memory.scene_memory.action_proj.1.bias", + "memory.scene_memory.write_gate.0.weight", + "memory.scene_memory.write_gate.0.bias", + "memory.scene_memory.write_gate.1.weight", + "memory.scene_memory.write_gate.1.bias", + "memory.scene_memory.write_gate.3.weight", + "memory.scene_memory.write_gate.3.bias", + "memory.scene_memory.token_proj.0.weight", + "memory.scene_memory.token_proj.0.bias", + "memory.scene_memory.token_proj.1.weight", + "memory.scene_memory.token_proj.1.bias", + "memory.belief_memory.position_embedding", + "memory.belief_memory.bank_queries", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.in_proj_bias", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.weight", + "memory.belief_memory.sequence_encoder.layers.0.self_attn.out_proj.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear1.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear1.bias", + "memory.belief_memory.sequence_encoder.layers.0.linear2.weight", + "memory.belief_memory.sequence_encoder.layers.0.linear2.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm1.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm1.bias", + "memory.belief_memory.sequence_encoder.layers.0.norm2.weight", + "memory.belief_memory.sequence_encoder.layers.0.norm2.bias", + "memory.belief_memory.bank_attention.in_proj_weight", + "memory.belief_memory.bank_attention.in_proj_bias", + "memory.belief_memory.bank_attention.out_proj.weight", + "memory.belief_memory.bank_attention.out_proj.bias", + "memory.belief_memory.action_proj.0.weight", + "memory.belief_memory.action_proj.0.bias", + "memory.belief_memory.action_proj.1.weight", + "memory.belief_memory.action_proj.1.bias", + "memory.belief_memory.write_gate.0.weight", + "memory.belief_memory.write_gate.0.bias", + "memory.belief_memory.write_gate.1.weight", + "memory.belief_memory.write_gate.1.bias", + "memory.belief_memory.write_gate.3.weight", + "memory.belief_memory.write_gate.3.bias", + "memory.belief_memory.token_proj.0.weight", + "memory.belief_memory.token_proj.0.bias", + "memory.belief_memory.token_proj.1.weight", + "memory.belief_memory.token_proj.1.bias", + "decoder.arm_decoder.layers.0.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.0.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.0.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.0.linear1.weight", + "decoder.arm_decoder.layers.0.linear1.bias", + "decoder.arm_decoder.layers.0.linear2.weight", + "decoder.arm_decoder.layers.0.linear2.bias", + "decoder.arm_decoder.layers.0.norm1.weight", + "decoder.arm_decoder.layers.0.norm1.bias", + "decoder.arm_decoder.layers.0.norm2.weight", + "decoder.arm_decoder.layers.0.norm2.bias", + "decoder.arm_decoder.layers.0.norm3.weight", + "decoder.arm_decoder.layers.0.norm3.bias", + "decoder.arm_decoder.layers.1.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.1.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.1.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.1.linear1.weight", + "decoder.arm_decoder.layers.1.linear1.bias", + "decoder.arm_decoder.layers.1.linear2.weight", + "decoder.arm_decoder.layers.1.linear2.bias", + "decoder.arm_decoder.layers.1.norm1.weight", + "decoder.arm_decoder.layers.1.norm1.bias", + "decoder.arm_decoder.layers.1.norm2.weight", + "decoder.arm_decoder.layers.1.norm2.bias", + "decoder.arm_decoder.layers.1.norm3.weight", + "decoder.arm_decoder.layers.1.norm3.bias", + "decoder.arm_decoder.layers.2.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.2.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.2.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.2.linear1.weight", + "decoder.arm_decoder.layers.2.linear1.bias", + "decoder.arm_decoder.layers.2.linear2.weight", + "decoder.arm_decoder.layers.2.linear2.bias", + "decoder.arm_decoder.layers.2.norm1.weight", + "decoder.arm_decoder.layers.2.norm1.bias", + "decoder.arm_decoder.layers.2.norm2.weight", + "decoder.arm_decoder.layers.2.norm2.bias", + "decoder.arm_decoder.layers.2.norm3.weight", + "decoder.arm_decoder.layers.2.norm3.bias", + "decoder.arm_decoder.layers.3.self_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.self_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.self_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.self_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_weight", + "decoder.arm_decoder.layers.3.multihead_attn.in_proj_bias", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.weight", + "decoder.arm_decoder.layers.3.multihead_attn.out_proj.bias", + "decoder.arm_decoder.layers.3.linear1.weight", + "decoder.arm_decoder.layers.3.linear1.bias", + "decoder.arm_decoder.layers.3.linear2.weight", + "decoder.arm_decoder.layers.3.linear2.bias", + "decoder.arm_decoder.layers.3.norm1.weight", + "decoder.arm_decoder.layers.3.norm1.bias", + "decoder.arm_decoder.layers.3.norm2.weight", + "decoder.arm_decoder.layers.3.norm2.bias", + "decoder.arm_decoder.layers.3.norm3.weight", + "decoder.arm_decoder.layers.3.norm3.bias", + "decoder.arm_identity.weight", + "decoder.phase_adapter.weight", + "decoder.phase_adapter.bias", + "decoder.role_adapter.weight", + "decoder.role_adapter.bias", + "decoder.context_proj.0.weight", + "decoder.context_proj.0.bias", + "decoder.context_proj.1.weight", + "decoder.context_proj.1.bias", + "decoder.arm_head.0.weight", + "decoder.arm_head.0.bias", + "decoder.arm_head.1.weight", + "decoder.arm_head.1.bias", + "decoder.arm_mean.weight", + "decoder.arm_mean.bias", + "decoder.arm_log_std.weight", + "decoder.arm_log_std.bias", + "decoder.proposal_mode_head.0.weight", + "decoder.proposal_mode_head.0.bias", + "decoder.proposal_mode_head.1.weight", + "decoder.proposal_mode_head.1.bias", + "decoder.proposal_mode_head.3.weight", + "decoder.proposal_mode_head.3.bias", + "decoder.proposal_mode_embeddings.weight", + "decoder.proposal_slot_embeddings.weight", + "decoder.mode_residual_heads.0.0.weight", + "decoder.mode_residual_heads.0.0.bias", + "decoder.mode_residual_heads.0.1.weight", + "decoder.mode_residual_heads.0.1.bias", + "decoder.mode_residual_heads.0.3.weight", + "decoder.mode_residual_heads.0.3.bias", + "decoder.mode_residual_heads.1.0.weight", + "decoder.mode_residual_heads.1.0.bias", + "decoder.mode_residual_heads.1.1.weight", + "decoder.mode_residual_heads.1.1.bias", + "decoder.mode_residual_heads.1.3.weight", + "decoder.mode_residual_heads.1.3.bias", + "decoder.mode_residual_heads.2.0.weight", + "decoder.mode_residual_heads.2.0.bias", + "decoder.mode_residual_heads.2.1.weight", + "decoder.mode_residual_heads.2.1.bias", + "decoder.mode_residual_heads.2.3.weight", + "decoder.mode_residual_heads.2.3.bias", + "decoder.mode_residual_heads.3.0.weight", + "decoder.mode_residual_heads.3.0.bias", + "decoder.mode_residual_heads.3.1.weight", + "decoder.mode_residual_heads.3.1.bias", + "decoder.mode_residual_heads.3.3.weight", + "decoder.mode_residual_heads.3.3.bias", + "decoder.mode_residual_heads.4.0.weight", + "decoder.mode_residual_heads.4.0.bias", + "decoder.mode_residual_heads.4.1.weight", + "decoder.mode_residual_heads.4.1.bias", + "decoder.mode_residual_heads.4.3.weight", + "decoder.mode_residual_heads.4.3.bias", + "decoder.mode_residual_heads.5.0.weight", + "decoder.mode_residual_heads.5.0.bias", + "decoder.mode_residual_heads.5.1.weight", + "decoder.mode_residual_heads.5.1.bias", + "decoder.mode_residual_heads.5.3.weight", + "decoder.mode_residual_heads.5.3.bias", + "decoder.slot_delta.0.weight", + "decoder.slot_delta.0.bias", + "decoder.slot_delta.1.weight", + "decoder.slot_delta.1.bias", + "decoder.slot_delta.3.weight", + "decoder.slot_delta.3.bias", + "decoder.proposal_score.0.weight", + "decoder.proposal_score.0.bias", + "decoder.proposal_score.1.weight", + "decoder.proposal_score.1.bias", + "decoder.proposal_score.3.weight", + "decoder.proposal_score.3.bias", + "elastic_state_head.interaction_queries", + "elastic_state_head.interaction_attention.in_proj_weight", + "elastic_state_head.interaction_attention.in_proj_bias", + "elastic_state_head.interaction_attention.out_proj.weight", + "elastic_state_head.interaction_attention.out_proj.bias", + "elastic_state_head.interaction_mlp.0.weight", + "elastic_state_head.interaction_mlp.0.bias", + "elastic_state_head.interaction_mlp.1.weight", + "elastic_state_head.interaction_mlp.1.bias", + "elastic_state_head.interaction_mlp.3.weight", + "elastic_state_head.interaction_mlp.3.bias", + "elastic_state_head.decoder.field_queries", + "elastic_state_head.decoder.field_attention.in_proj_weight", + "elastic_state_head.decoder.field_attention.in_proj_bias", + "elastic_state_head.decoder.field_attention.out_proj.weight", + "elastic_state_head.decoder.field_attention.out_proj.bias", + "elastic_state_head.decoder.field_mlp.0.weight", + "elastic_state_head.decoder.field_mlp.0.bias", + "elastic_state_head.decoder.field_mlp.1.weight", + "elastic_state_head.decoder.field_mlp.1.bias", + "elastic_state_head.decoder.field_mlp.3.weight", + "elastic_state_head.decoder.field_mlp.3.bias", + "elastic_state_head.decoder.summary_proj.0.weight", + "elastic_state_head.decoder.summary_proj.0.bias", + "elastic_state_head.decoder.summary_proj.1.weight", + "elastic_state_head.decoder.summary_proj.1.bias", + "elastic_state_head.decoder.phase_head.0.weight", + "elastic_state_head.decoder.phase_head.0.bias", + "elastic_state_head.decoder.phase_head.1.weight", + "elastic_state_head.decoder.phase_head.1.bias", + "elastic_state_head.decoder.phase_head.3.weight", + "elastic_state_head.decoder.phase_head.3.bias", + "elastic_state_head.decoder.arm_role_head.0.weight", + "elastic_state_head.decoder.arm_role_head.0.bias", + "elastic_state_head.decoder.arm_role_head.1.weight", + "elastic_state_head.decoder.arm_role_head.1.bias", + "elastic_state_head.decoder.arm_role_head.3.weight", + "elastic_state_head.decoder.arm_role_head.3.bias", + "elastic_state_head.decoder.arm_identity.weight", + "elastic_state_head.decoder.support_mode.0.weight", + "elastic_state_head.decoder.support_mode.0.bias", + "elastic_state_head.decoder.support_mode.1.weight", + "elastic_state_head.decoder.support_mode.1.bias", + "elastic_state_head.decoder.support_mode.3.weight", + "elastic_state_head.decoder.support_mode.3.bias", + "elastic_state_head.decoder.access_field.weight", + "elastic_state_head.decoder.access_field.bias", + "elastic_state_head.decoder.target_belief_field.weight", + "elastic_state_head.decoder.target_belief_field.bias", + "elastic_state_head.decoder.visibility_field.weight", + "elastic_state_head.decoder.visibility_field.bias", + "elastic_state_head.decoder.clearance_field.weight", + "elastic_state_head.decoder.clearance_field.bias", + "elastic_state_head.decoder.occluder_contact_field.weight", + "elastic_state_head.decoder.occluder_contact_field.bias", + "elastic_state_head.decoder.grasp_affordance_field.weight", + "elastic_state_head.decoder.grasp_affordance_field.bias", + "elastic_state_head.decoder.support_stability_field.weight", + "elastic_state_head.decoder.support_stability_field.bias", + "elastic_state_head.decoder.persistence_field.weight", + "elastic_state_head.decoder.persistence_field.bias", + "elastic_state_head.decoder.reocclusion_field.weight", + "elastic_state_head.decoder.reocclusion_field.bias", + "elastic_state_head.decoder.disturbance_field.weight", + "elastic_state_head.decoder.disturbance_field.bias", + "elastic_state_head.decoder.uncertainty_field.weight", + "elastic_state_head.decoder.uncertainty_field.bias", + "elastic_state_head.decoder.reocclusion_head.0.weight", + "elastic_state_head.decoder.reocclusion_head.0.bias", + "elastic_state_head.decoder.reocclusion_head.1.weight", + "elastic_state_head.decoder.reocclusion_head.1.bias", + "elastic_state_head.decoder.reocclusion_head.3.weight", + "elastic_state_head.decoder.reocclusion_head.3.bias", + "world_model.state_encoder.0.weight", + "world_model.state_encoder.0.bias", + "world_model.state_encoder.1.weight", + "world_model.state_encoder.1.bias", + "world_model.scene_memory_proj.0.weight", + "world_model.scene_memory_proj.0.bias", + "world_model.scene_memory_proj.1.weight", + "world_model.scene_memory_proj.1.bias", + "world_model.belief_memory_proj.0.weight", + "world_model.belief_memory_proj.0.bias", + "world_model.belief_memory_proj.1.weight", + "world_model.belief_memory_proj.1.bias", + "world_model.action_encoder.0.weight", + "world_model.action_encoder.0.bias", + "world_model.action_encoder.1.weight", + "world_model.action_encoder.1.bias", + "world_model.transition.weight_ih", + "world_model.transition.weight_hh", + "world_model.transition.bias_ih", + "world_model.transition.bias_hh", + "world_model.scene_memory_update.weight", + "world_model.scene_memory_update.bias", + "world_model.belief_memory_update.weight", + "world_model.belief_memory_update.bias", + "world_model.compact_decoder.weight", + "world_model.compact_decoder.bias", + "world_model.target_belief_head.weight", + "world_model.target_belief_head.bias", + "world_model.visibility_head.weight", + "world_model.visibility_head.bias", + "world_model.clearance_head.weight", + "world_model.clearance_head.bias", + "world_model.occluder_contact_head.weight", + "world_model.occluder_contact_head.bias", + "world_model.grasp_affordance_head.weight", + "world_model.grasp_affordance_head.bias", + "world_model.support_stability_head.weight", + "world_model.support_stability_head.bias", + "world_model.persistence_head.weight", + "world_model.persistence_head.bias", + "world_model.reocclusion_head.weight", + "world_model.reocclusion_head.bias", + "world_model.disturbance_head.weight", + "world_model.disturbance_head.bias", + "world_model.uncertainty_head.weight", + "world_model.uncertainty_head.bias", + "world_model.access_head.weight", + "world_model.access_head.bias", + "planner.residual.trunk.0.weight", + "planner.residual.trunk.0.bias", + "planner.residual.trunk.1.weight", + "planner.residual.trunk.1.bias", + "planner.residual.trunk.3.weight", + "planner.residual.trunk.3.bias", + "planner.residual.success_head.weight", + "planner.residual.success_head.bias", + "planner.residual.risk_head.weight", + "planner.residual.risk_head.bias", + "planner.residual.residual_head.weight", + "planner.residual.residual_head.bias" + ], + "unexpected_keys": [] + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..4f97be52baa009b311fe02a1f75ca1a35ce4bd1d --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.5, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5833333333333334, + "visibility_integral": 32.27142822328541, + "corridor_availability": 0.882228939069642, + "reocclusion_rate": 0.0, + "persistence_horizon_mae": 2.1414308214317197, + "disturbance_cost": 0.3078485221550282 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..62fbc289c590765dbb96c23298be04df438e5364 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_full/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/checkpoint_best.pt +- mean_success: 0.583 +- visibility_integral: 32.271 +- corridor_availability: 0.882 +- reocclusion_rate: 0.000 +- persistence_horizon_mae: 2.141 +- disturbance_cost: 0.308 +- foliage_proxy_success: 0.500 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.json new file mode 100644 index 0000000000000000000000000000000000000000..e1233443a30b93ddf34d7edeacd8420a5bfc4934 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.json @@ -0,0 +1,15 @@ +{ + "full": { + "per_task_success": { + "foliage_proxy": 0.4583333333333333, + "bag_proxy": 0.5416666666666666, + "cloth_proxy": 0.7083333333333334 + }, + "mean_success": 0.5694444444444445, + "visibility_integral": 31.285325296223164, + "corridor_availability": 0.8095407477683492, + "reocclusion_rate": 0.013385315139701101, + "persistence_horizon_mae": 2.936828779474813, + "disturbance_cost": 0.3132026158790622 + } +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.md b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..89f8f4c2eced501a98daf0e36f47cfdc54aea19f --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/benchmark_no_depth/reveal_benchmark.md @@ -0,0 +1,13 @@ +# Reveal Proxy Benchmark + +## full +- checkpoint: /workspace/VLAarchtests/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/checkpoint_best.pt +- mean_success: 0.569 +- visibility_integral: 31.285 +- corridor_availability: 0.810 +- reocclusion_rate: 0.013 +- persistence_horizon_mae: 2.937 +- disturbance_cost: 0.313 +- foliage_proxy_success: 0.458 +- bag_proxy_success: 0.542 +- cloth_proxy_success: 0.708 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/config_resolved.yaml b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/config_resolved.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f95b269273891af66f01204fdc3d8d102945fa27 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/config_resolved.yaml @@ -0,0 +1,149 @@ +experiment_name: proxy_interaction_r3d_stage3_clip_rgbd_seed18 +output_dir: /workspace/VLAarchtests/artifacts/outputs/r3d +device: cuda +seed: 18 +init_checkpoint: /workspace/VLAarchtests/artifacts/outputs/reveal_runs/proxy_backbone_only_clip/checkpoint_best.pt +init_strict: false +data: + proxies: + - foliage_proxy + - bag_proxy + - cloth_proxy + resolution: 224 + dataset_version: reveal_proxy_v6_rgbd_elastic_state + train_episodes_per_proxy: 48 + val_episodes_per_proxy: 16 + train_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_train_clip224_v6_rgbd_stage3_seed18.pt + val_dataset_path: /workspace/VLAarchtests/artifacts/data/reveal_proxy/proxy_val_clip224_v6_rgbd_stage3_seed18.pt + rebuild_dataset: false + chunk_horizon: 8 + rollout_horizon: 5 + history_steps: 6 + planner_candidates: 8 + seed: 18 +optim: + epochs: 4 + batch_size: 2 + num_workers: 4 + lr: 0.0003 + weight_decay: 0.0001 +trainer: + policy_type: elastic_reveal + use_bf16: true + grad_clip_norm: 1.0 + freeze_backbone: true + gradient_checkpointing: false + plan_during_train: true + plan_during_eval: true + support_mode_conditioning: true + planner_mode: trainable + use_depth: true + use_world_model: true + use_role_tokens: true + compute_equivariance_probe: true +policy: + backbone: + model_name: openai/clip-vit-base-patch32 + hidden_dim: 512 + max_text_tokens: 32 + freeze_backbone: true + gradient_checkpointing: false + use_dummy_backbone: false + fusion: + hidden_dim: 512 + num_cameras: 3 + num_layers: 4 + num_heads: 8 + ff_dim: 2048 + dropout: 0.1 + proprio_dim: 32 + proprio_tokens: 1 + memory: + hidden_dim: 512 + action_dim: 14 + history_steps: 6 + scene_history_steps: 3 + belief_history_steps: 8 + num_layers: 2 + dropout: 0.1 + memory_bank_size: 4 + scene_bank_size: 2 + belief_bank_size: 2 + num_heads: 8 + max_history_steps: 8 + decoder: + hidden_dim: 512 + num_heads: 8 + num_layers: 4 + ff_dim: 2048 + dropout: 0.1 + chunk_size: 8 + action_dim: 14 + arm_action_dim: 7 + num_candidates: 8 + num_phases: 5 + num_arm_roles: 4 + num_proposal_modes: 6 + planner_top_k: 4 + reveal_head: + hidden_dim: 512 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + belief_map_size: 32 + field_size: 16 + num_heads: 8 + predict_belief_map: true + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + world_model: + hidden_dim: 512 + action_dim: 14 + num_support_modes: 3 + num_approach_templates: 32 + rollout_horizon: 5 + field_size: 16 + num_heads: 8 + num_phases: 5 + num_arm_roles: 4 + num_interaction_tokens: 8 + belief_map_size: 32 + predict_belief_map: true + scene_bank_size: 2 + belief_bank_size: 2 + planner: + hidden_dim: 512 + num_candidates: 8 + action_dim: 14 + num_support_modes: 3 + utility_margin: 0.1 + num_heads: 8 + num_layers: 2 + num_phases: 5 + num_arm_roles: 4 + top_k: 4 +loss_weights: + action: 1.0 + phase: 0.1 + arm_role: 0.15 + support_mode: 0.1 + corridor: 0.15 + persistence: 0.05 + disturbance: 0.05 + world_model: 0.25 + belief: 0.05 + visibility: 0.05 + clearance: 0.05 + support_stability: 0.05 + reocclusion: 0.05 + occluder_contact: 0.05 + grasp_affordance: 0.05 + planner_success: 0.25 + planner_risk: 0.1 + planner_ranking: 0.2 + proposal_reconstruction: 0.1 + proposal_success: 0.15 + proposal_ranking: 0.2 + proposal_diversity: 0.05 + role_swap_consistency: 0.05 diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/diagnostics_full/proxy_diagnostics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/diagnostics_full/proxy_diagnostics.json new file mode 100644 index 0000000000000000000000000000000000000000..47717f873e78fbe6821e9076495a55f15214b0d4 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/diagnostics_full/proxy_diagnostics.json @@ -0,0 +1,16 @@ +{ + "planner_top1_accuracy": 0.32575757575757575, + "planner_regret": 0.013780632056295872, + "planner_score_utility_spearman": 0.2590909004211426, + "risk_calibration_mse": 0.011394849047064781, + "role_collapse_rate": 0.0, + "proposal_diversity": 0.021189916878938675, + "left_right_equivariance_error": 4.1925632985746205e-05, + "belief_calibration_brier": 0.004418548196554184, + "reocclusion_calibration_brier": 0.2664291560649872, + "support_stability_mae": 0.03443893417716026, + "clearance_auc": 0.7519833077975241, + "memory_write_rate": 0.0, + "memory_saturation": 0.3426786959171295, + "num_samples": 132 +} \ No newline at end of file diff --git a/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/metrics.json b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/metrics.json new file mode 100644 index 0000000000000000000000000000000000000000..d2a93ea85f23aad4210a8f0ceb474539d9c90cb8 --- /dev/null +++ b/artifacts/outputs/r3d/proxy_interaction_r3d_stage3_clip_rgbd_seed18/metrics.json @@ -0,0 +1,230 @@ +[ + { + "epoch": 0, + "train": { + "action": 0.025857772268549933, + "arm_role": 0.03370982127663977, + "belief": 0.11251551973882146, + "clearance": 0.09382800325868329, + "corridor": 0.27670082988227224, + "disturbance": 0.0059105119088080135, + "grasp_affordance": 0.01767918948786265, + "occluder_contact": 0.20790174291396016, + "persistence": 7.249429424941852, + "phase": 0.7393677396924084, + "planner_ranking": 0.18070141599557474, + "planner_risk": 0.014856026590806651, + "planner_success": 0.6360785897177551, + "proposal_diversity": 0.0, + "proposal_ranking": 1.2621668352506548, + "proposal_reconstruction": 0.06770114458278212, + "proposal_success": 0.6843253252393913, + "reocclusion": 0.733561779504047, + "role_swap_consistency": 0.0005007284111038572, + "support_mode": 0.7864344731675392, + "support_stability": 0.15464979819270797, + "total": 1.8893053952936103, + "uncertainty": 0.026158929999575815, + "visibility": 0.10713219952754949, + "world_model": 2.6855637587177816 + }, + "val": { + "action": 0.023218068632889877, + "arm_role": 7.036943040447747e-06, + "belief": 0.09010097717471195, + "clearance": 0.07386604707800981, + "corridor": 0.2412589482511535, + "disturbance": 0.002226812243944468, + "grasp_affordance": 0.009947083184890675, + "occluder_contact": 0.1953272647929914, + "persistence": 4.425601812926206, + "phase": 0.7730295369119355, + "planner_ranking": 0.04223714639947778, + "planner_risk": 0.01139929191435151, + "planner_success": 0.5929040561119715, + "proposal_diversity": 0.0, + "proposal_ranking": 1.185825102257006, + "proposal_reconstruction": 0.06571822122416714, + "proposal_success": 0.6717942483497389, + "reocclusion": 0.6932443497758923, + "role_swap_consistency": 0.0, + "support_mode": 0.726759416587425, + "support_stability": 0.14309178433860792, + "total": 1.6142820137919802, + "uncertainty": 0.007038127842613242, + "visibility": 0.08926947330209342, + "world_model": 2.4654864957838347 + } + }, + { + "epoch": 1, + "train": { + "action": 0.022110589677516702, + "arm_role": 1.3950919605674544e-05, + "belief": 0.10294150522086008, + "clearance": 0.08084176621199902, + "corridor": 0.2462356197038246, + "disturbance": 0.0031714126446295934, + "grasp_affordance": 0.00960333755054555, + "occluder_contact": 0.20049099460322195, + "persistence": 4.177739099682314, + "phase": 0.7043367555628273, + "planner_ranking": 0.04967470209174159, + "planner_risk": 0.01318734941372638, + "planner_success": 0.5723307314499511, + "proposal_diversity": 0.0, + "proposal_ranking": 1.150115387289936, + "proposal_reconstruction": 0.0644153186281002, + "proposal_success": 0.6753117866541078, + "reocclusion": 0.6987067278455065, + "role_swap_consistency": 0.00042066709767766197, + "support_mode": 0.6952358066099477, + "support_stability": 0.14108095217145084, + "total": 1.5119679860419628, + "uncertainty": 0.004842953095748942, + "visibility": 0.1003879376184878, + "world_model": 2.1801892893476635 + }, + "val": { + "action": 0.022499137826151018, + "arm_role": 6.598947992174406e-05, + "belief": 0.10226583751765164, + "clearance": 0.07544161554313067, + "corridor": 0.24163203528433136, + "disturbance": 0.001789042631902372, + "grasp_affordance": 0.008704299170693213, + "occluder_contact": 0.19236745504718838, + "persistence": 4.6868142503680605, + "phase": 0.6602843403816223, + "planner_ranking": 0.04781789132296884, + "planner_risk": 0.011765290400724167, + "planner_success": 0.5356740147778483, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1275392030224656, + "proposal_reconstruction": 0.06496855588347623, + "proposal_success": 0.6853868401411808, + "reocclusion": 0.7616267091397083, + "role_swap_consistency": 0.0, + "support_mode": 0.6959606806437174, + "support_stability": 0.14195433682338757, + "total": 1.5516490313139828, + "uncertainty": 0.002906694681962218, + "visibility": 0.09106272707382838, + "world_model": 2.2986758297139946 + } + }, + { + "epoch": 2, + "train": { + "action": 0.02224867461317497, + "arm_role": 1.0064135047153653e-05, + "belief": 0.101081568201643, + "clearance": 0.08345475727545966, + "corridor": 0.2517957490776222, + "disturbance": 0.0029570121168599525, + "grasp_affordance": 0.009918136390520952, + "occluder_contact": 0.20453518028346657, + "persistence": 4.101219920899855, + "phase": 0.6840078943062827, + "planner_ranking": 0.039329256105235716, + "planner_risk": 0.01054730894868701, + "planner_success": 0.5327235887031905, + "proposal_diversity": 0.0, + "proposal_ranking": 1.13903670847728, + "proposal_reconstruction": 0.0643397857342403, + "proposal_success": 0.6784668517986517, + "reocclusion": 0.7002001897826868, + "role_swap_consistency": 0.0007044284810045129, + "support_mode": 0.6855724394633508, + "support_stability": 0.14184507578416333, + "total": 1.4588422775268555, + "uncertainty": 0.00190604528785849, + "visibility": 0.0957828067360124, + "world_model": 2.046591355538493 + }, + "val": { + "action": 0.022229419372072727, + "arm_role": 3.2403206591190994e-06, + "belief": 0.09474777622204839, + "clearance": 0.0783396718854254, + "corridor": 0.23935590486860636, + "disturbance": 0.0021153995247996963, + "grasp_affordance": 0.0088278352649826, + "occluder_contact": 0.19912470809438013, + "persistence": 3.8268655773365134, + "phase": 0.6582440318483295, + "planner_ranking": 0.03990731150164114, + "planner_risk": 0.011713616791618706, + "planner_success": 0.5134278662276991, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1207696849649602, + "proposal_reconstruction": 0.06454906855342966, + "proposal_success": 0.6825091883991704, + "reocclusion": 0.6904694817282937, + "role_swap_consistency": 0.0, + "support_mode": 0.654139602726156, + "support_stability": 0.1375041804699735, + "total": 1.509128962502335, + "uncertainty": 0.0008296288133627086, + "visibility": 0.0897342140475909, + "world_model": 2.3714701876495825 + } + }, + { + "epoch": 3, + "train": { + "action": 0.021563103183596857, + "arm_role": 9.534371460919605e-05, + "belief": 0.11680854938885304, + "clearance": 0.09178280478333616, + "corridor": 0.3002950780047043, + "disturbance": 0.003815645619568824, + "grasp_affordance": 0.019201423960654992, + "occluder_contact": 0.2269966553955178, + "persistence": 3.9096838283699666, + "phase": 0.6496198564299738, + "planner_ranking": 0.03442381335639937, + "planner_risk": 0.011457197463837164, + "planner_success": 0.5073940541341667, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1320187015683238, + "proposal_reconstruction": 0.06363906914182983, + "proposal_success": 0.6781648984130141, + "reocclusion": 0.6639381964491066, + "role_swap_consistency": 0.0008551519205571269, + "support_mode": 0.6192906564442899, + "support_stability": 0.15393702852593355, + "total": 1.4377938396643595, + "uncertainty": 0.007347671914954727, + "visibility": 0.11880907560674307, + "world_model": 2.0381793492127462 + }, + "val": { + "action": 0.025203945693757498, + "arm_role": 1.054821041179821e-06, + "belief": 0.11612348100452712, + "clearance": 0.10017752489357283, + "corridor": 0.3126967510942257, + "disturbance": 0.005627329672058819, + "grasp_affordance": 0.018004601313309235, + "occluder_contact": 0.2143698472416762, + "persistence": 3.829360609704798, + "phase": 0.7265884849158201, + "planner_ranking": 0.03285199802378498, + "planner_risk": 0.011394622192407647, + "planner_success": 0.4938382957920884, + "proposal_diversity": 0.0, + "proposal_ranking": 1.1160721056389087, + "proposal_reconstruction": 0.06633412369498701, + "proposal_success": 0.6799869302547339, + "reocclusion": 0.6805009720000353, + "role_swap_consistency": 0.0, + "support_mode": 0.6576023919112755, + "support_stability": 0.16122697604199251, + "total": 1.4556549372095051, + "uncertainty": 0.007743606238033284, + "visibility": 0.12334947400923932, + "world_model": 2.0783897286111657 + } + } +] \ No newline at end of file diff --git a/artifacts/outputs/r3d_smoke/smoke_checks.json b/artifacts/outputs/r3d_smoke/smoke_checks.json new file mode 100644 index 0000000000000000000000000000000000000000..5003b8224d87f419882be0ae52b2401a1320a0e4 --- /dev/null +++ b/artifacts/outputs/r3d_smoke/smoke_checks.json @@ -0,0 +1,225 @@ +{ + "proxy": { + "losses": { + "action": 0.464199423789978, + "phase": 1.8022403717041016, + "arm_role": 3.169889450073242, + "support_mode": 1.0952036380767822, + "corridor": 0.6662508249282837, + "persistence": 3.4530017375946045, + "disturbance": 0.19386449456214905, + "belief": 0.7597116827964783, + "visibility": 0.0, + "clearance": 0.0, + "support_stability": 0.0, + "occluder_contact": 0.0, + "grasp_affordance": 0.0, + "reocclusion": 0.67914879322052, + "uncertainty": 0.8255214691162109, + "world_model": 8.876996040344238, + "planner_success": 0.7317572236061096, + "planner_risk": 0.0804181918501854, + "planner_ranking": 1.3189352750778198, + "proposal_reconstruction": 0.5751029849052429, + "proposal_success": 0.7016865015029907, + "proposal_ranking": 0.8174347877502441, + "proposal_diversity": 0.0, + "role_swap_consistency": 0.0, + "total": 2.791285991668701 + }, + "grad_norm": 3.0634233951568604, + "candidate_shape": [ + 2, + 8, + 8, + 14 + ], + "rollout_phase_shape": [ + 2, + 8, + 8, + 5 + ] + }, + "rlbench": { + "losses": { + "action": 0.9079780578613281, + "world_model": 0.0, + "planner_success": 0.0, + "planner_risk": 0.0, + "planner_ranking": 0.0, + "proposal_reconstruction": 0.0, + "proposal_success": 0.0, + "proposal_ranking": 0.0, + "proposal_diversity": 0.0, + "role_swap_consistency": 0.0, + "total": 0.9079780578613281 + }, + "grad_norm": 3.2202138900756836, + "candidate_shape": [ + 2, + 4, + 8, + 14 + ], + "rollout_phase_shape": [ + 2, + 4, + 8, + 5 + ], + "planner_enabled_for_eval": true, + "frozen_modules": [ + "interaction_head", + "world_model", + "planner" + ] + }, + "elastic_reveal": { + "rgb_only_candidate_shape": [ + 2, + 4, + 8, + 14 + ], + "rgbd_candidate_shape": [ + 2, + 4, + 8, + 14 + ], + "rgbd_topk_shape": [ + 2, + 4 + ], + "rgbd_rollout_shape": [ + 2, + 4, + 8, + 1, + 16, + 16 + ], + "noplanner_chunk_shape": [ + 2, + 8, + 14 + ], + "equivariance_probe_shape": [ + 2, + 8, + 14 + ], + "dataset_v6_missing_fields": [] + }, + "policy_config": { + "backbone": { + "model_name": "openai/clip-vit-base-patch32", + "hidden_dim": 64, + "max_text_tokens": 32, + "freeze_backbone": true, + "gradient_checkpointing": false, + "use_dummy_backbone": true, + "depth_patch_size": 16, + "geometry_feature_dim": 8, + "use_camera_geometry": true + }, + "fusion": { + "hidden_dim": 64, + "num_cameras": 3, + "num_layers": 2, + "num_heads": 4, + "ff_dim": 128, + "dropout": 0.1, + "proprio_dim": 32, + "proprio_tokens": 1, + "geometry_num_heads": 4 + }, + "memory": { + "hidden_dim": 64, + "action_dim": 14, + "history_steps": 6, + "num_layers": 2, + "dropout": 0.1, + "memory_bank_size": 4, + "num_heads": 4, + "max_history_steps": 8, + "scene_bank_size": 2, + "belief_bank_size": 2, + "scene_history_steps": 3, + "belief_history_steps": 8, + "memory_write_threshold": 0.45, + "memory_suppression_margin": 0.05 + }, + "decoder": { + "hidden_dim": 64, + "num_heads": 4, + "num_layers": 2, + "ff_dim": 128, + "dropout": 0.1, + "chunk_size": 8, + "action_dim": 14, + "arm_action_dim": 7, + "num_candidates": 4, + "num_phases": 5, + "num_arm_roles": 4, + "num_proposal_modes": 6, + "planner_top_k": 4 + }, + "reveal_head": { + "hidden_dim": 64, + "num_support_modes": 3, + "num_approach_templates": 32, + "rollout_horizon": 3, + "belief_map_size": 32, + "field_size": 16, + "num_heads": 4, + "predict_belief_map": true, + "num_phases": 5, + "num_arm_roles": 4, + "num_interaction_tokens": 8 + }, + "world_model": { + "hidden_dim": 64, + "action_dim": 14, + "num_support_modes": 3, + "num_approach_templates": 32, + "rollout_horizon": 3, + "field_size": 16, + "num_heads": 4, + "num_phases": 5, + "num_arm_roles": 4, + "num_interaction_tokens": 8, + "belief_map_size": 32, + "predict_belief_map": true, + "scene_bank_size": 2, + "belief_bank_size": 2 + }, + "planner": { + "hidden_dim": 64, + "num_candidates": 4, + "action_dim": 14, + "num_support_modes": 3, + "utility_margin": 0.1, + "corridor_weight": 1.0, + "persistence_weight": 0.5, + "proposal_weight": 0.5, + "task_progress_weight": 0.75, + "disturbance_weight": 0.75, + "reocclusion_weight": 0.5, + "visibility_weight": 0.25, + "num_heads": 4, + "num_layers": 2, + "num_phases": 5, + "num_arm_roles": 4, + "top_k": 4, + "belief_gain_weight": 1.0, + "visibility_gain_weight": 0.75, + "clearance_weight": 0.75, + "occluder_contact_weight": 0.5, + "grasp_affordance_weight": 0.75, + "support_stability_weight": 0.5, + "residual_weight": 0.5 + } + } +} \ No newline at end of file