diff --git a/.gitattributes b/.gitattributes index 394fa0b0887c05eb737a449b59d2efadda52cd67..f2586691102baab452c523a97358a8b8b22d2034 100644 --- a/.gitattributes +++ b/.gitattributes @@ -43,3 +43,4 @@ EMS-superquadric_fitting_inference/src/EMS/__pycache__/EMS_recovery.Distance-286 EMS-superquadric_fitting_inference/src/EMS/__pycache__/EMS_recovery.SwitchCost-265.py312.1.nbc filter=lfs diff=lfs merge=lfs -text EMS-superquadric_fitting_inference/src/EMS/__pycache__/EMS_recovery.SimilarityCandidates-138.py312.1.nbc filter=lfs diff=lfs merge=lfs -text EMS-superquadric_fitting_inference/src/EMS/__pycache__/EMS_recovery.CostFunc-307.py311.1.nbc filter=lfs diff=lfs merge=lfs -text +nano_WaveGen/checkpoints_text2wave/losses.png filter=lfs diff=lfs merge=lfs -text diff --git a/nano_WaveGen/.gitignore b/nano_WaveGen/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c124854da88a0aec857356ab46608d62b576fb0e --- /dev/null +++ b/nano_WaveGen/.gitignore @@ -0,0 +1,45 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.pyc + +# PyTorch +*.pth +*.pt +checkpoint*.pth +best_model*.pth +final_model.pth + +# Logs and outputs +logs/ +*.log +core_space/ +outputs/ + +# Jupyter Notebook +.ipynb_checkpoints/ + +# Environment +.env +venv/ + + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Temporary files +*.tmp +*.bak + +data/movi_a_128x128 +wandb +*.npz \ No newline at end of file diff --git a/nano_WaveGen/checkpoints_text2wave/best.pt b/nano_WaveGen/checkpoints_text2wave/best.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa3dda5e17b4bf71052c3b5e534f18e6bf2bc908 --- /dev/null +++ b/nano_WaveGen/checkpoints_text2wave/best.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a83304beb70d3e77c15709df1d486ce35e2a32dbebc13285914d763a54e74ee +size 926425298 diff --git a/nano_WaveGen/checkpoints_text2wave/latest.pt b/nano_WaveGen/checkpoints_text2wave/latest.pt new file mode 100644 index 
0000000000000000000000000000000000000000..be5c54bb85b4eae5656d8ffb4868ac51981e31df --- /dev/null +++ b/nano_WaveGen/checkpoints_text2wave/latest.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f5e09c520f8fde415c6f95a7347dd0d3fa49f03fb1342da95f04a436c736567 +size 926458332 diff --git a/nano_WaveGen/checkpoints_text2wave/losses.png b/nano_WaveGen/checkpoints_text2wave/losses.png new file mode 100644 index 0000000000000000000000000000000000000000..f1c6835b6c82f9916b7f0ab90380ef6c2d808e32 --- /dev/null +++ b/nano_WaveGen/checkpoints_text2wave/losses.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f7b41963b5ce8524acc2c5e2bb53d5f332494838411f710f53df54e52908c6 +size 269025 diff --git a/nano_WaveGen/checkpoints_text2wave/training_log.txt b/nano_WaveGen/checkpoints_text2wave/training_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..547ec705a60f032b912ee2fdfcc8e05cd8a3bb7c --- /dev/null +++ b/nano_WaveGen/checkpoints_text2wave/training_log.txt @@ -0,0 +1,9 @@ +2025-12-05 18:40:09 ============================================================ +2025-12-05 18:40:09 Dataset Information: +2025-12-05 18:40:09 - Training samples: 100 +2025-12-05 18:40:09 - Batch size: 24 +2025-12-05 18:40:09 - Steps per epoch (full dataset): 5 +2025-12-05 18:40:09 - Total training steps: 50000 +2025-12-05 18:40:09 - Will traverse dataset: 10000.00 times +2025-12-05 18:40:09 ============================================================ +2025-12-05 18:41:07 New best checkpoint at step 5: train_loss=2085.487305 diff --git a/nano_WaveGen/checkpoints_text2wave/training_stats.npz b/nano_WaveGen/checkpoints_text2wave/training_stats.npz new file mode 100644 index 0000000000000000000000000000000000000000..df0706680cf21a61b11d34b6a569d03f31ce6f1f --- /dev/null +++ b/nano_WaveGen/checkpoints_text2wave/training_stats.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cc87b2676cd86e2d19f72c320931e39eeba832d8e385b937c4f10c7261a752cf +size 3496 diff --git a/nano_WaveGen/configs/accelerate_config.yaml b/nano_WaveGen/configs/accelerate_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b4391588251288c67c1c1f3aa228b6e1656be3 --- /dev/null +++ b/nano_WaveGen/configs/accelerate_config.yaml @@ -0,0 +1,15 @@ +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/nano_WaveGen/configs/default.yaml b/nano_WaveGen/configs/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..29e9e73f332430540d23e9accd4bd9054feb49bd --- /dev/null +++ b/nano_WaveGen/configs/default.yaml @@ -0,0 +1,154 @@ +# configs/default.yaml + +# ==================== 核心配置 ==================== + +# 训练配置 +training: + batch_size: 24 # Reduced for faster first step (8 per GPU) + + # 基于步数的训练(v30新增) + max_steps: 50000 # 总训练步数(可以根据需要调整) + + # 时序预测配置 + max_history_frames: 3 # 最大历史帧数(1-3帧) + bidirectional_training: true # 双向预测训练(从中间帧开始) + # max_prediction_frames: 1 # 由模型覆盖整段序列,这里保留用于兼容 + initial_teacher_forcing_prob: 0.5 # 初始锚点帧教师强制概率 + initial_frame: + strategy: "middle" # 可选: middle | fixed | random + offset: 0 # 在基础策略上的偏移 + random_history_sampling: true # 是否在推理时随机选择0-历史帧数量作为参考帧 + freeze_static_from_anchor: true # 是否在预测序列时固定0-5维的静态参数沿用锚点帧 + multi_sample_attempts: 5 # 每个样本的随机尝试次数(仅训练时启用) + decoder_noise_std: 0.2 # 解码阶段加入的高斯噪声标准差(0表示禁用) + frame_rate: 8.0 # MOVi 样本在预处理阶段统一为 8fps,速度计算与训练假设保持一致 + + # 调试和日志配置 + debug_print_interval: 1 # 每多少步打印一次调试信息(损失值等) + log_interval: 50 # 每多少步记录一次日志 + + learning_rate: 0.001 # Standard T5 learning rate + gradient_clip_val: 1.0 # Gradient clipping value + + # GPU配置 + use_multi_gpu: 
true # 启用多GPU训练 + gpu_list: [0, 1, 2, 3, 4, 5, 6, 7] # 可用GPU列表 + use_free_gpus: true # 自动选择空闲GPU + + # 保存训练的模型和生成结果 配置 + evaluation: + max_batches: 0 # 例如0禁用验证,只比较训练 loss。改成 None 就是验证全量;任何正整数则限制评估批次数。 + + save_generation: + enabled: true #保存训练的模型 + save_gt: true # 是否保存GT数据 + fixed_samples: 5 # 固定样本数量,用于对比 + save_interval: 100 # 每100步保存一次 + save_dir: "core_space" # 保存目录 + + +# ==================== Text2Wave 配置 ==================== + +# 模型设置 +text2wave_model: + # 原始模型: google/long-t5-tglobal-base + model_name: "google/t5-v1_1-small" + +# 损失函数配置 +loss: + + # 损失权重 + weights: + wave_loss: 4.0 # 波损失(超二次元参数)权重 + wave_contrastive_loss: 2.0 # 序列级对比损失权重 + world_info_loss: 0.5 # 世界信息损失(相机,缩放,时间)权重 + controllable_info_loss: 0.1 # 可控制信息损失(质量,摩擦,弹性)权重 + pla_loss: 3.0 # 最小作用量约束损失权重 + + wave_contrastive: + temperature: 0.2 # 对比分布温度 + +# 数据配置 +data: + # MOVi数据集配置 + num_workers: 32 # 数据加载线程数 + max_sequences: 100 # 最大序列数,-1表示使用所有数据,设置较小值用于快速测试 + +physics: + gravity: 9.81 # 自由落体重力加速度(单位:m/s^2) + collision_buffer: 1.05 # 判定碰撞时的半径放大系数 + +# ==================== Wave2Pixel 配置 ==================== + +# 网格配置 +grid: + size: 64 # 3D网格分辨率 + prob_threshold: 0.5 + +# 世界坐标系配置 +world_coordinate_system: + enabled: true # 是否启用世界坐标系 + world_scale: 10.0 # 世界坐标范围 ±10米 + voxel_size: 0.05 # 体素大小 5cm + near_plane: 0.1 # 近平面距离 + far_plane: 50.0 # 远平面距离 + predict_world_scale: true # 让模型预测世界缩放比例 + world_scale_loss_weight: 0.1 # 世界缩放比例损失权重 - 增加到1.0以加快学习 + +# 相机配置 +camera: + default_view: "front" + fov: 60 + near: 0.1 + far: 100.0 + # 世界坐标系中的相机位置 + views: + front: [0, 0, 2] + back: [0, 0, -2] + left: [-2, 0, 0] + right: [2, 0, 0] + top: [0, 2, 0] + bottom: [0, -2, 0] + # 相机旋转角度 (pitch, yaw, roll) + view_rotations: + front: [0, 0, 0] + back: [0, 3.14159, 0] # 180度旋转 + left: [0, -1.5708, 0] # -90度旋转 + right: [0, 1.5708, 0] # 90度旋转 + top: [-1.5708, 0, 0] # -90度俯视 + bottom: [1.5708, 0, 0] # 90度仰视 + +# 生成配置 +generation: + mode: "image" # "image" 或 "video" + time: + start: 0.0 + end: 12.0 + fps: 30 # 外部观察频率 + 
timestep: 0.0 # 用于单帧图像生成 + compute_wsf: false # 是否默认计算完整WSF场 + output_dir: "core_space" # 默认输出目录 + +# 输出格式配置 +output: + format: "triple_channel" # 可选: "complex", "dual_channel", "triple_channel" + third_channel: "amplitude" # 如果format为"triple_channel",第三通道的内容: "amplitude", "phase", "none" + +# Wave2Pixel相关的模型组件 +model: + wave_encoder: + hidden_dim: 256 + dropout: 0.1 + + feature_extractor: + input_dim: 4 # 实部、虚部、振幅、相位 + hidden_dim: 64 + output_dim: 32 + dropout: 0.1 + + # 重命名为pixel_net以匹配代码中的使用 + pixel_net: + channels: [32, 64, 128, 64, 4] # 最后4通道: RGB + 概率 + kernel_size: 3 + padding: 1 + dropout: 0.1 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/batch_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/batch_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..9287060653c20fdd8953b667584831472f1c7f33 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/batch_statistics.json @@ -0,0 +1,151 @@ +{ + "batch_size": 5, + "num_frames": 24, + "timestamp": "2025-12-05T18:42:58.362258", + "objects_per_frame": [ + { + "frame": 0, + "mean_objects": 3.4, + "max_objects": 5, + "min_objects": 2 + }, + { + "frame": 1, + "mean_objects": 3.6, + "max_objects": 6, + "min_objects": 2 + }, + { + "frame": 2, + "mean_objects": 3.8, + "max_objects": 6, + "min_objects": 3 + }, + { + "frame": 3, + "mean_objects": 3.6, + "max_objects": 6, + "min_objects": 2 + }, + { + "frame": 4, + "mean_objects": 4.0, + "max_objects": 7, + "min_objects": 3 + }, + { + "frame": 5, + "mean_objects": 3.8, + "max_objects": 7, + "min_objects": 2 + }, + { + "frame": 6, + "mean_objects": 4.0, + "max_objects": 7, + "min_objects": 3 + }, + { + "frame": 7, + "mean_objects": 3.6, + "max_objects": 6, + "min_objects": 2 + }, + { + "frame": 8, + "mean_objects": 4.2, + "max_objects": 8, + "min_objects": 3 + }, + { + "frame": 9, + "mean_objects": 3.6, + "max_objects": 7, + "min_objects": 2 + }, + { + "frame": 10, + 
"mean_objects": 3.4, + "max_objects": 5, + "min_objects": 2 + }, + { + "frame": 11, + "mean_objects": 3.6, + "max_objects": 7, + "min_objects": 2 + }, + { + "frame": 12, + "mean_objects": 3.4, + "max_objects": 6, + "min_objects": 2 + }, + { + "frame": 13, + "mean_objects": 3.8, + "max_objects": 7, + "min_objects": 2 + }, + { + "frame": 14, + "mean_objects": 2.6, + "max_objects": 6, + "min_objects": 0 + }, + { + "frame": 15, + "mean_objects": 3.2, + "max_objects": 6, + "min_objects": 1 + }, + { + "frame": 16, + "mean_objects": 2.8, + "max_objects": 7, + "min_objects": 1 + }, + { + "frame": 17, + "mean_objects": 2.8, + "max_objects": 5, + "min_objects": 1 + }, + { + "frame": 18, + "mean_objects": 3.0, + "max_objects": 6, + "min_objects": 1 + }, + { + "frame": 19, + "mean_objects": 2.8, + "max_objects": 5, + "min_objects": 1 + }, + { + "frame": 20, + "mean_objects": 2.4, + "max_objects": 4, + "min_objects": 0 + }, + { + "frame": 21, + "mean_objects": 3.0, + "max_objects": 5, + "min_objects": 1 + }, + { + "frame": 22, + "mean_objects": 2.8, + "max_objects": 5, + "min_objects": 1 + }, + { + "frame": 23, + "mean_objects": 3.0, + "max_objects": 5, + "min_objects": 1 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/error_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/error_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..c1670e98804e0b00825e16c3ed5ca05066f972e6 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/error_statistics.json @@ -0,0 +1,4 @@ +{ + "object_mae": 1.520950198173523, + "world_mae": 3.048270600940062 +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/info.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/info.txt new file mode 100644 index 
0000000000000000000000000000000000000000..d3511b7c97199b58178a308bc84d145f7ae9cffe --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/info.txt @@ -0,0 +1,10 @@ +Text: large yellow metal cylinder, small yellow metal cylinder, small gray metal sphere +Generated at step: 5 +Number of frames: 24 +Sequence: sample_00000 + +--- Model Output Summary --- +Max objects: 10 +Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3]) +World parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1]) +Physics parameters: 3 (mass + friction + restitution) diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/Full_Sample_Data_for_Learning_Target.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/Full_Sample_Data_for_Learning_Target.npz new file mode 100644 index 0000000000000000000000000000000000000000..40c2808c1093cc3239a85e77aa19f571bb7d5c6d --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/Full_Sample_Data_for_Learning_Target.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a99d3bd56b39f896e3f1febab643edf5c312fbc73d82073da95288a8b61a0ee +size 30977 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/camera_trajectory.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/camera_trajectory.npz new file mode 100644 index 0000000000000000000000000000000000000000..cb51fe050ac11a61d7c54c27ff5232f5dd33e4dc --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/camera_trajectory.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fd4cdb82402db1d9b33317c378ad09319a97c25c846d6d7435e4f7ff1d9786 +size 450 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/depth/depth_merge.npz 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/depth/depth_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..b59f682498fecae79ccf2a0d868f6baf977c88c0 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/depth/depth_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f17e9fda8898016fa785a6d002e2fb865c4945f38a486be7520263cf4704e02f +size 2225336 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/file_manifest.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/file_manifest.txt new file mode 100644 index 0000000000000000000000000000000000000000..978cbea980349831cedd42e395a42bdb99b1301a --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/file_manifest.txt @@ -0,0 +1,38 @@ +Original sequence: sample_00000 +Data split: validation +Original path: ../data/movi_a_128x128/validation/sample_00000 +Copied at: 2025-12-05T18:42:54.238918 + +Files included: +- Full_Sample_Data_for_Learning_Target.npz +- camera_trajectory.npz +- depth/depth_merge.npz +- metadata.json +- normal/normal_merge.npz +- object_coordinates/object_coordinates_merge.npz +- point_clouds/point_clouds_merge.npz +- rgb/frame_000.png +- rgb/frame_001.png +- rgb/frame_002.png +- rgb/frame_003.png +- rgb/frame_004.png +- rgb/frame_005.png +- rgb/frame_006.png +- rgb/frame_007.png +- rgb/frame_008.png +- rgb/frame_009.png +- rgb/frame_010.png +- rgb/frame_011.png +- rgb/frame_012.png +- rgb/frame_013.png +- rgb/frame_014.png +- rgb/frame_015.png +- rgb/frame_016.png +- rgb/frame_017.png +- rgb/frame_018.png +- rgb/frame_019.png +- rgb/frame_020.png +- rgb/frame_021.png +- rgb/frame_022.png +- rgb/frame_023.png +- segmentation/segmentation_merge.npz diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/metadata.json 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..4e3239bbb847b1722ea6f7fe94dd07b160ad2652 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/metadata.json @@ -0,0 +1,78 @@ +{ + "num_frames": 24, + "num_instances": 3, + "resolution": 128, + "depth_range": [ + 7.109129905700684, + 67.49893951416016 + ], + "camera": { + "focal_length": 35.0, + "sensor_width": 32.0, + "field_of_view": 0.8575560450553894, + "K": [ + [ + 140.0, + 0.0, + 64.0 + ], + [ + 0.0, + 140.0, + 64.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ] + }, + "instances": [ + { + "id": 1, + "shape": "cylinder", + "size": "large", + "color": "yellow", + "color_rgb": [ + 1.0, + 0.9333333373069763, + 0.019607843831181526 + ], + "material": "metal", + "mass": 5.784790992736816, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 2, + "shape": "cylinder", + "size": "small", + "color": "yellow", + "color_rgb": [ + 1.0, + 0.9333333373069763, + 0.019607843831181526 + ], + "material": "metal", + "mass": 0.723098874092102, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 3, + "shape": "sphere", + "size": "small", + "color": "gray", + "color_rgb": [ + 0.34117648005485535, + 0.34117648005485535, + 0.34117648005485535 + ], + "material": "metal", + "mass": 0.483887255191803, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/normal/normal_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/normal/normal_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..35effc98fcebe6651b8c0980cbff7fc3e16f6878 --- /dev/null +++ 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/normal/normal_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01026452044e3bccdf88dae92dca52d818e95795b4e813b5ce567604de090a23 +size 126632 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/object_coordinates/object_coordinates_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/object_coordinates/object_coordinates_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..3dee39f5c6811ea7a39f88da461ac7adf5039a40 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/object_coordinates/object_coordinates_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e29f3ba063dceb54307959ad67c82ba1b0803b2901515a5a17cbddb407c2661 +size 1620075 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/point_clouds/point_clouds_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/point_clouds/point_clouds_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..37eb6b2a99d938b172b53734a679ff0958db9876 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/point_clouds/point_clouds_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f3bc0a862953979939ec1e974e0f22cfcf78b09b384c20ba0a523f1b817fa86 +size 8503662 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_000.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_000.png new file mode 100644 index 0000000000000000000000000000000000000000..0d8cbc17b94809d6670d831d40f9ef069e9798c4 Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_000.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_001.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_001.png new file mode 100644 index 0000000000000000000000000000000000000000..52bab5205771758cdfba3b6450c809bf1138fce6 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_001.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_002.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_002.png new file mode 100644 index 0000000000000000000000000000000000000000..dd4a3d7a28ed4d6746ec8663331e299761652c72 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_002.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_003.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_003.png new file mode 100644 index 0000000000000000000000000000000000000000..bf00bd24f786996b7d641fbfebc236d84294401e Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_003.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_004.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_004.png new file mode 100644 index 0000000000000000000000000000000000000000..379857fca762a13b197636b413a25ca4a0401b5a Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_004.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_005.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_005.png new file mode 100644 index 0000000000000000000000000000000000000000..75515e9e6d52ccb514f9c65f0a43ef7e15480c72 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_005.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_006.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_006.png new file mode 100644 index 0000000000000000000000000000000000000000..754ba710b8b60e63cbbee07e39f5abe244905b37 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_006.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_007.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_007.png new file mode 100644 index 0000000000000000000000000000000000000000..f394747c099ddf6cef4bcae5a0f8ff827257129a Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_007.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_008.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_008.png new file mode 100644 index 0000000000000000000000000000000000000000..446469cc99af3db74eb2e1fd39719a61f9207a09 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_008.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_009.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_009.png new file mode 100644 index 0000000000000000000000000000000000000000..edb43483d33c947ebb088eaf49b98f23aa5bb3d6 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_009.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_010.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_010.png new file mode 100644 index 0000000000000000000000000000000000000000..003ad717066ce6e1beb8b2d2821564f51fb3b87f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_010.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_011.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_011.png new file mode 100644 index 0000000000000000000000000000000000000000..24a64a513f1e872004e301fdefed7f2e9bf9ea57 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_011.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_012.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_012.png new file mode 100644 index 0000000000000000000000000000000000000000..49dd0e18277f5781c4e38bfc6fe6745e130f9df7 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_012.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_013.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_013.png new file mode 100644 index 
0000000000000000000000000000000000000000..0fa9be840a80f6d69f087fd2bfe1b442add1465c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_013.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_014.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_014.png new file mode 100644 index 0000000000000000000000000000000000000000..2bd45c6da480aeba76b51b632e7bb6a352cb4003 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_014.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_015.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_015.png new file mode 100644 index 0000000000000000000000000000000000000000..f7b080dad3b33428c345ac5a9c6c2bf9c555dd35 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_015.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_016.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_016.png new file mode 100644 index 0000000000000000000000000000000000000000..b92398c1eff58150c732acbd118b6bfcd0b48c7b Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_016.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_017.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_017.png new file mode 100644 index 0000000000000000000000000000000000000000..cdf32bd9241e88dd578b0d68411f4d49cc340fb2 Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_017.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_018.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_018.png new file mode 100644 index 0000000000000000000000000000000000000000..d194ec94cae703a2930fd1091e4b021ea9c44561 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_018.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_019.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_019.png new file mode 100644 index 0000000000000000000000000000000000000000..2805103c312776724ea89f2d7be6f98e57ca16c9 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_019.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_020.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_020.png new file mode 100644 index 0000000000000000000000000000000000000000..d02cea5ebc9f67abc7b27c6c34c381dfa5688bfa Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_020.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_021.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_021.png new file mode 100644 index 0000000000000000000000000000000000000000..6a27b2afc16f73348fae583d038e99d9777341a7 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_021.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_022.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_022.png new file mode 100644 index 0000000000000000000000000000000000000000..ec361fda731a13c0c7ef7b84e493572f124c1077 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_022.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_023.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_023.png new file mode 100644 index 0000000000000000000000000000000000000000..8902e02baeda3da6c42dce22370244e82c3d01b3 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/rgb/frame_023.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/segmentation/segmentation_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/segmentation/segmentation_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..9bf1780d7e09327d5187f68329295b6f7985d910 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/original_data/segmentation/segmentation_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147566467e6cf53cd5478853e69d816161f9dd3e5b3198021a40de9e4405272d +size 9184 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/predictions.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/predictions.npz new file mode 100644 index 0000000000000000000000000000000000000000..2b10c407d9d5ae6e4cdd3fe6b045612fc44e4180 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/predictions.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f36d0a5cf1afa6e06e29b9e6fe572848d6e0fbf9193e0c26a319bf6dc9d95653 +size 124404 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/targets.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/targets.npz new file mode 100644 index 0000000000000000000000000000000000000000..d9f43c649ef34e8c961ca418dfd3e4270db89cc3 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_0/targets.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bee90f8ec231b1c46f276e2de5164df93d2d09f7750f2d80964dc9fc15361408 +size 115668 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/error_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/error_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a9817df633d4438083b141d431e83620eb13d7 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/error_statistics.json @@ -0,0 +1,4 @@ +{ + "object_mae": 1.5807031393051147, + "world_mae": 2.9718214426902705 +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/info.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/info.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8babcb521fb34c619e2436df98ec4bf6f63f0d3 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/info.txt @@ -0,0 +1,10 @@ +Text: large purple metal cylinder, large blue rubber cylinder, small green metal cube, small red rubber sphere, small gray rubber cylinder, small blue rubber cylinder, small yellow rubber cube, small purple rubber sphere, small red metal cylinder +Generated at step: 5 +Number of frames: 24 +Sequence: sample_00001 + +--- Model Output Summary --- +Max objects: 10 +Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3]) +World 
parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1]) +Physics parameters: 3 (mass + friction + restitution) diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/Full_Sample_Data_for_Learning_Target.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/Full_Sample_Data_for_Learning_Target.npz new file mode 100644 index 0000000000000000000000000000000000000000..f8c55f06faa807c4e4b4c184a1bcb0336eb6dcab --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/Full_Sample_Data_for_Learning_Target.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da40da407c8b7f95f661a141ed6b6c32a340af5ab4d291c91889b89b5631504e +size 42494 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/camera_trajectory.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/camera_trajectory.npz new file mode 100644 index 0000000000000000000000000000000000000000..57bfc5eaa4750121a0cc578c8d8c2692cdecd9bc --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/camera_trajectory.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cfb7d3c4771eba542d817ce3b5a9eacb68d504a24efde175ef043c08cd7a9b4 +size 450 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/depth/depth_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/depth/depth_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..6a8791d87e410e4ff1db53ea45299ac9e575a6e1 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/depth/depth_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f45b046b60d325d9fe6710acbc59eb661e7fe4095a52096178177ff63df8d589 +size 2104698 diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/file_manifest.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/file_manifest.txt new file mode 100644 index 0000000000000000000000000000000000000000..4df5beacef20d74684e4f0a11d81e5f28defe091 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/file_manifest.txt @@ -0,0 +1,38 @@ +Original sequence: sample_00001 +Data split: validation +Original path: ../data/movi_a_128x128/validation/sample_00001 +Copied at: 2025-12-05T18:42:55.094762 + +Files included: +- Full_Sample_Data_for_Learning_Target.npz +- camera_trajectory.npz +- depth/depth_merge.npz +- metadata.json +- normal/normal_merge.npz +- object_coordinates/object_coordinates_merge.npz +- point_clouds/point_clouds_merge.npz +- rgb/frame_000.png +- rgb/frame_001.png +- rgb/frame_002.png +- rgb/frame_003.png +- rgb/frame_004.png +- rgb/frame_005.png +- rgb/frame_006.png +- rgb/frame_007.png +- rgb/frame_008.png +- rgb/frame_009.png +- rgb/frame_010.png +- rgb/frame_011.png +- rgb/frame_012.png +- rgb/frame_013.png +- rgb/frame_014.png +- rgb/frame_015.png +- rgb/frame_016.png +- rgb/frame_017.png +- rgb/frame_018.png +- rgb/frame_019.png +- rgb/frame_020.png +- rgb/frame_021.png +- rgb/frame_022.png +- rgb/frame_023.png +- segmentation/segmentation_merge.npz diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/metadata.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..dc581b5ec495284c5cf20a6d9b2b0baf3fb820f6 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/metadata.json @@ -0,0 +1,168 @@ +{ + "num_frames": 24, + "num_instances": 9, + "resolution": 128, + "depth_range": [ + 5.137400150299072, + 89.9398193359375 + ], + "camera": { + "focal_length": 
35.0, + "sensor_width": 32.0, + "field_of_view": 0.8575560450553894, + "K": [ + [ + 140.0, + 0.0, + 64.0 + ], + [ + 0.0, + 140.0, + 64.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ] + }, + "instances": [ + { + "id": 1, + "shape": "cylinder", + "size": "large", + "color": "purple", + "color_rgb": [ + 0.5058823823928833, + 0.14901961386203766, + 0.7529411911964417 + ], + "material": "metal", + "mass": 5.784790992736816, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 2, + "shape": "cylinder", + "size": "large", + "color": "blue", + "color_rgb": [ + 0.16470588743686676, + 0.29411765933036804, + 0.843137264251709 + ], + "material": "rubber", + "mass": 2.356766700744629, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 3, + "shape": "cube", + "size": "small", + "color": "green", + "color_rgb": [ + 0.11372549086809158, + 0.4117647111415863, + 0.0784313753247261 + ], + "material": "metal", + "mass": 0.9201729893684387, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 4, + "shape": "sphere", + "size": "small", + "color": "red", + "color_rgb": [ + 0.6784313917160034, + 0.13725490868091583, + 0.13725490868091583 + ], + "material": "rubber", + "mass": 0.19713924825191498, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 5, + "shape": "cylinder", + "size": "small", + "color": "gray", + "color_rgb": [ + 0.34117648005485535, + 0.34117648005485535, + 0.34117648005485535 + ], + "material": "rubber", + "mass": 0.2945958375930786, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 6, + "shape": "cylinder", + "size": "small", + "color": "blue", + "color_rgb": [ + 0.16470588743686676, + 0.29411765933036804, + 0.843137264251709 + ], + "material": "rubber", + "mass": 0.2945958375930786, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 7, + "shape": "cube", + "size": "small", + 
"color": "yellow", + "color_rgb": [ + 1.0, + 0.9333333373069763, + 0.019607843831181526 + ], + "material": "rubber", + "mass": 0.37488529086112976, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 8, + "shape": "sphere", + "size": "small", + "color": "purple", + "color_rgb": [ + 0.5058823823928833, + 0.14901961386203766, + 0.7529411911964417 + ], + "material": "rubber", + "mass": 0.19713924825191498, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 9, + "shape": "cylinder", + "size": "small", + "color": "red", + "color_rgb": [ + 0.6784313917160034, + 0.13725490868091583, + 0.13725490868091583 + ], + "material": "metal", + "mass": 0.723098874092102, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/normal/normal_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/normal/normal_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..7db945b431a83d4e53b33e0bceaf8689fda77e84 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/normal/normal_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34586040d44568a754d9e499c4757b356e0187e5337182ce2325d4f31a025ed1 +size 293784 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/object_coordinates/object_coordinates_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/object_coordinates/object_coordinates_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..a6420faa50b291ef2076ee767f092eff0a76483c --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/object_coordinates/object_coordinates_merge.npz @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d0754dd9ef2c0d2626622ae4808f2b80db0a5e904cbaf53ce196069317c91ac3 +size 1683329 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/point_clouds/point_clouds_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/point_clouds/point_clouds_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..754dfb2080e41f5d0bb757dfc4b48c6d85832cda --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/point_clouds/point_clouds_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd812bc4676f540ea2ee6c42d21cd2423b3521914a87b5c0fa6179dc54bec10 +size 8801876 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_000.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_000.png new file mode 100644 index 0000000000000000000000000000000000000000..c9a667a212ba1b516696d2b329204f45593a6245 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_000.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_001.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_001.png new file mode 100644 index 0000000000000000000000000000000000000000..b34c1196d25b4fc5af26af6f738853302fdfb572 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_001.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_002.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_002.png new file mode 100644 index 0000000000000000000000000000000000000000..ecd7234fc1963cb75bdc4e15a0c8f999b9d0cb59 Binary 
files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_002.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_003.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_003.png new file mode 100644 index 0000000000000000000000000000000000000000..a42ba338624389e4b46ca1ab0e585a20e147b377 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_003.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_004.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_004.png new file mode 100644 index 0000000000000000000000000000000000000000..5703d7fdba245366e13a280402857e45855826e8 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_004.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_005.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_005.png new file mode 100644 index 0000000000000000000000000000000000000000..5123c72f6b856f98f1e655b874ce1afb8fd2b1a5 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_005.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_006.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_006.png new file mode 100644 index 0000000000000000000000000000000000000000..ba40d4fe322c67153659172a59584a4b23d077d9 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_006.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_007.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_007.png new file mode 100644 index 0000000000000000000000000000000000000000..b170e99e74295ce3afdf539dbe2b9e0b56bc8848 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_007.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_008.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_008.png new file mode 100644 index 0000000000000000000000000000000000000000..d8e26f3caac47508fc455b97f9ea8e56e00e6825 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_008.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_009.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_009.png new file mode 100644 index 0000000000000000000000000000000000000000..674801eebfe4fdd599e70105b18977989e78a1c8 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_009.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_010.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_010.png new file mode 100644 index 0000000000000000000000000000000000000000..5aff94384232dbabd98e58cede441542f3bd7bcc Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_010.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_011.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_011.png new file mode 100644 index 0000000000000000000000000000000000000000..804c9cca37462c1c855289b070c19c51e61c0d5e Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_011.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_012.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_012.png new file mode 100644 index 0000000000000000000000000000000000000000..eaef540baf1f6b3cb27649b15c12cdce10c66084 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_012.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_013.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_013.png new file mode 100644 index 0000000000000000000000000000000000000000..302214a40cded551d4b435b0050ee26c88dc8003 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_013.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_014.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_014.png new file mode 100644 index 0000000000000000000000000000000000000000..5de5bf691bea408df4620582f2b5b7678b35f47c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_014.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_015.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_015.png new file mode 100644 index 
0000000000000000000000000000000000000000..45b8ad150255dd6833376c5299de688d7a084d9c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_015.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_016.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_016.png new file mode 100644 index 0000000000000000000000000000000000000000..b3d51a9387e144f2e3d7f22c6f74ab98edb78e32 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_016.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_017.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_017.png new file mode 100644 index 0000000000000000000000000000000000000000..aa80e4bb6554d67e9959229d01a199b31a554ab4 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_017.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_018.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_018.png new file mode 100644 index 0000000000000000000000000000000000000000..979b07053828df381b4a7a4c3e6b2fc33e79beff Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_018.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_019.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_019.png new file mode 100644 index 0000000000000000000000000000000000000000..666982dcccdf3891783fc37a147f42420285aea2 Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_019.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_020.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_020.png new file mode 100644 index 0000000000000000000000000000000000000000..123eb8b181e658fa32215cd39fb73f506bb43be5 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_020.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_021.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_021.png new file mode 100644 index 0000000000000000000000000000000000000000..45837a01467835d059fcb97bac7c0187b671fcdf Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_021.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_022.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_022.png new file mode 100644 index 0000000000000000000000000000000000000000..a3e576ec4f9d4a0c9667b4d3e56573f59a33ea04 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_022.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_023.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_023.png new file mode 100644 index 0000000000000000000000000000000000000000..e7333010b5cedddd719fff0cdad1b299d396206e Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/rgb/frame_023.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/segmentation/segmentation_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/segmentation/segmentation_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..0f3fded51b603dbedbf929179b636439de482012 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/original_data/segmentation/segmentation_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb62d445b2e21b46216cef048dbb0a397042dc9efff4bf92b00eae84a97f8be +size 12800 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/predictions.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/predictions.npz new file mode 100644 index 0000000000000000000000000000000000000000..77264ce87eee2e6336f0e9fed5a10b1e130bf04e --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/predictions.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57af34bb3acf7170e593480406de110da2852174cd725e9cda9f6c8dcd5c3805 +size 125040 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/targets.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/targets.npz new file mode 100644 index 0000000000000000000000000000000000000000..c67848e8da75e2953777bacd5f819f77ff8afd39 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_1/targets.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2946d50fe6fe7d758bc9cdcad2e5b8bc5b48d2b6f8523777d97494eb2fba889 +size 116304 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/error_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/error_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..42cad08a9e68fae19678ce61ec66c57fd967c433 --- 
/dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/error_statistics.json @@ -0,0 +1,4 @@ +{ + "object_mae": 1.4326672554016113, + "world_mae": 2.9999434136552736 +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/info.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/info.txt new file mode 100644 index 0000000000000000000000000000000000000000..770e25a64f636d894df368ce042d9ef55106802a --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/info.txt @@ -0,0 +1,10 @@ +Text: large gray rubber cube, large blue rubber sphere, large red rubber cube, large blue metal sphere +Generated at step: 5 +Number of frames: 24 +Sequence: sample_00002 + +--- Model Output Summary --- +Max objects: 10 +Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3]) +World parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1]) +Physics parameters: 3 (mass + friction + restitution) diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/Full_Sample_Data_for_Learning_Target.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/Full_Sample_Data_for_Learning_Target.npz new file mode 100644 index 0000000000000000000000000000000000000000..1280d45c591c355d2d88794f598f2b4b61fa058d --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/Full_Sample_Data_for_Learning_Target.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7209406440b65e254a52c346f76264842adeb90da16f677dc54fad2f89ccd3e7 +size 35628 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/camera_trajectory.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/camera_trajectory.npz new file mode 100644 index 
0000000000000000000000000000000000000000..be2cb661797c93175d5b992df4c67afb858c1db3 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/camera_trajectory.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef4bb76f82f4597d56c6f9b16b5416b669bd5564198114384b8323224f8a8eb +size 450 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/depth/depth_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/depth/depth_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..4abecc7f232ed8eac463261c4c900368c5a0cbf9 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/depth/depth_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1783d38e7dfaa31fd9b12bce2d881e7b9a773a71f431362b2f6d57aa2b54abc +size 2196598 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/file_manifest.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/file_manifest.txt new file mode 100644 index 0000000000000000000000000000000000000000..03ab0f7958bc9abb4c599cd122993922cec6c4bd --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/file_manifest.txt @@ -0,0 +1,38 @@ +Original sequence: sample_00002 +Data split: validation +Original path: ../data/movi_a_128x128/validation/sample_00002 +Copied at: 2025-12-05T18:42:56.007135 + +Files included: +- Full_Sample_Data_for_Learning_Target.npz +- camera_trajectory.npz +- depth/depth_merge.npz +- metadata.json +- normal/normal_merge.npz +- object_coordinates/object_coordinates_merge.npz +- point_clouds/point_clouds_merge.npz +- rgb/frame_000.png +- rgb/frame_001.png +- rgb/frame_002.png +- rgb/frame_003.png +- rgb/frame_004.png +- rgb/frame_005.png +- rgb/frame_006.png +- rgb/frame_007.png +- rgb/frame_008.png 
+- rgb/frame_009.png +- rgb/frame_010.png +- rgb/frame_011.png +- rgb/frame_012.png +- rgb/frame_013.png +- rgb/frame_014.png +- rgb/frame_015.png +- rgb/frame_016.png +- rgb/frame_017.png +- rgb/frame_018.png +- rgb/frame_019.png +- rgb/frame_020.png +- rgb/frame_021.png +- rgb/frame_022.png +- rgb/frame_023.png +- segmentation/segmentation_merge.npz diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/metadata.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7d76e8bd91096b0cae742527d3419f4bd82801b9 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/metadata.json @@ -0,0 +1,93 @@ +{ + "num_frames": 24, + "num_instances": 4, + "resolution": 128, + "depth_range": [ + 6.737051010131836, + 66.9015121459961 + ], + "camera": { + "focal_length": 35.0, + "sensor_width": 32.0, + "field_of_view": 0.8575560450553894, + "K": [ + [ + 140.0, + 0.0, + 64.0 + ], + [ + 0.0, + 140.0, + 64.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ] + }, + "instances": [ + { + "id": 1, + "shape": "cube", + "size": "large", + "color": "gray", + "color_rgb": [ + 0.34117648005485535, + 0.34117648005485535, + 0.34117648005485535 + ], + "material": "rubber", + "mass": 2.999082326889038, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 2, + "shape": "sphere", + "size": "large", + "color": "blue", + "color_rgb": [ + 0.16470588743686676, + 0.29411765933036804, + 0.843137264251709 + ], + "material": "rubber", + "mass": 1.5771139860153198, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 3, + "shape": "cube", + "size": "large", + "color": "red", + "color_rgb": [ + 0.6784313917160034, + 0.13725490868091583, + 0.13725490868091583 + ], + "material": "rubber", + "mass": 2.999082326889038, + "friction": 0.800000011920929, + "restitution": 
0.699999988079071 + }, + { + "id": 4, + "shape": "sphere", + "size": "large", + "color": "blue", + "color_rgb": [ + 0.16470588743686676, + 0.29411765933036804, + 0.843137264251709 + ], + "material": "metal", + "mass": 3.871098041534424, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/normal/normal_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/normal/normal_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..a45ef16d0a5f5b285eac1507a78558e320494d1d --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/normal/normal_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95dfc818abeb09aef7c44b3bab5aa21aab7c1c811ad27e86287db64167742449 +size 257391 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/object_coordinates/object_coordinates_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/object_coordinates/object_coordinates_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..d8d29d357c433a16e1a4aa44dfb059943e777770 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/object_coordinates/object_coordinates_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136a3ba6a80e742fba77f27bd2b8f4fac400d46e2d36208c02e28a200a802574 +size 1653625 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/point_clouds/point_clouds_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/point_clouds/point_clouds_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..7c61b3f15b27bc7134b3d6ce9a8d8c2d3a76b68c --- /dev/null 
+++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/point_clouds/point_clouds_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd1943f93573498b8db7de7b3f38f5d1a08219bc2fc8205d64d7ef4b2c5395f +size 8951357 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_000.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_000.png new file mode 100644 index 0000000000000000000000000000000000000000..04a337d3f7f1c05b7a2beb69cb80d94d5aeac2bf Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_000.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_001.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_001.png new file mode 100644 index 0000000000000000000000000000000000000000..2b96099b94448df8d488075bccdb116ec4a2fc35 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_001.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_002.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_002.png new file mode 100644 index 0000000000000000000000000000000000000000..9cec2fe4c04f758b04c763036d304be755d37cc0 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_002.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_003.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_003.png new file mode 100644 index 0000000000000000000000000000000000000000..8024d223dd9b5d87e8d8c398e90dae1c89c37337 Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_003.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_004.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_004.png new file mode 100644 index 0000000000000000000000000000000000000000..1923827e273f6deb4bd47cf1e6cdc6c9417fe46a Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_004.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_005.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_005.png new file mode 100644 index 0000000000000000000000000000000000000000..91e1ddaa7f4d081be05d055afa313b09e1454980 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_005.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_006.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_006.png new file mode 100644 index 0000000000000000000000000000000000000000..41a3f76323fbe02ba9f3280492918607cc79dc1d Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_006.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_007.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_007.png new file mode 100644 index 0000000000000000000000000000000000000000..e502a91e12ea17e1c34e703aa1ab8f177a128eda Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_007.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_008.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_008.png new file mode 100644 index 0000000000000000000000000000000000000000..c9854d9fa576def1b930d0447ef522312bbf6483 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_008.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_009.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_009.png new file mode 100644 index 0000000000000000000000000000000000000000..55660155274c6adf44c7f4e690cde339f2d5c91d Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_009.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_010.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_010.png new file mode 100644 index 0000000000000000000000000000000000000000..d32e1b1c62299145d0e848b9c51385e9cb642416 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_010.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_011.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_011.png new file mode 100644 index 0000000000000000000000000000000000000000..f55c758998c2e726bfb33dac37c1033bfc2235e6 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_011.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_012.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_012.png new file mode 100644 index 0000000000000000000000000000000000000000..17f4ba5d9461742f99a7c559c317a985344a6e07 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_012.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_013.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_013.png new file mode 100644 index 0000000000000000000000000000000000000000..fbef1cda52665616cd12f38ee977240c2e18aba4 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_013.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_014.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_014.png new file mode 100644 index 0000000000000000000000000000000000000000..919fad4af52f81819a2158b96cbb47f38302dbab Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_014.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_015.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_015.png new file mode 100644 index 0000000000000000000000000000000000000000..e0a53764ed70ead2dd1924fb49fa008f8c54ac06 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_015.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_016.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_016.png new file mode 100644 index 
0000000000000000000000000000000000000000..e527cb4f3e23a729516589e2e39bf033d7976982 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_016.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_017.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_017.png new file mode 100644 index 0000000000000000000000000000000000000000..4f120984e1fdc0308c15ace902b330f5d98bfbb8 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_017.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_018.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_018.png new file mode 100644 index 0000000000000000000000000000000000000000..7243c37d91b897e9a468d0528c4cafe54c00e0d3 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_018.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_019.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_019.png new file mode 100644 index 0000000000000000000000000000000000000000..992be3e81b7d2e243e1d87059517c8e40b375dc4 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_019.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_020.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_020.png new file mode 100644 index 0000000000000000000000000000000000000000..49907b0059811c0da6ffc17f2c7b555e798fb29e Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_020.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_021.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_021.png new file mode 100644 index 0000000000000000000000000000000000000000..a0f34295ca103c3c89d897019f516e271f242ae5 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_021.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_022.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_022.png new file mode 100644 index 0000000000000000000000000000000000000000..9ff1030b1780c227651ed0fe85d60d76916c962b Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_022.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_023.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_023.png new file mode 100644 index 0000000000000000000000000000000000000000..68e6ccb57ca7cad477cc632efa84883bb4532b27 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/rgb/frame_023.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/segmentation/segmentation_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/segmentation/segmentation_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..28ddfca4ecdfb85917de14c15c3e5e254702e1f3 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/original_data/segmentation/segmentation_merge.npz @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:85b2d76d7cb49c452678c48a2670f77d6ec0aa2aea6b4d13249bcc40e3db4c54 +size 11950 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/predictions.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/predictions.npz new file mode 100644 index 0000000000000000000000000000000000000000..45890afefceea86397b180a45572eb1b0a1ccd9f --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/predictions.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4068f49ca720302a45f9765c33883de2a441a5d7a84a5b41b3afe0b040f1fea6 +size 124464 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/targets.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/targets.npz new file mode 100644 index 0000000000000000000000000000000000000000..61e0964860613a0afd4ee34d3e0df9b28da7524f --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_2/targets.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:088a948d372a742ba03fb34abe7e17e58e5ced40bd9d6f59cc1941d1da7c3bd2 +size 115728 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/error_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/error_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..9e99d4515dd0f8962d1f875dc759dfa72e8d6fcf --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/error_statistics.json @@ -0,0 +1,4 @@ +{ + "object_mae": 1.3683991432189941, + "world_mae": 2.9717430254277133 +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/info.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/info.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aa3b62d9837c6ee596ef2216b3639da8727b2af --- 
/dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/info.txt @@ -0,0 +1,10 @@ +Text: large yellow metal cylinder, small red rubber cube, small yellow metal sphere +Generated at step: 5 +Number of frames: 24 +Sequence: sample_00003 + +--- Model Output Summary --- +Max objects: 10 +Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3]) +World parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1]) +Physics parameters: 3 (mass + friction + restitution) diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/Full_Sample_Data_for_Learning_Target.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/Full_Sample_Data_for_Learning_Target.npz new file mode 100644 index 0000000000000000000000000000000000000000..20ba43faebf15c9857d1d1ab214c83ab9fea1e9e --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/Full_Sample_Data_for_Learning_Target.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e4ce245fe78fb32898c0b10fb15ad2af8dafca7563d36e1acbf5f578f2fd88 +size 31492 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/camera_trajectory.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/camera_trajectory.npz new file mode 100644 index 0000000000000000000000000000000000000000..ba679729839fc4ff6cbf5f5423d30c38f9b4b5ac --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/camera_trajectory.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63880a646522d49910b1defbde394203bbe47cac5dc5105d619e9b65d8c35b57 +size 450 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/depth/depth_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/depth/depth_merge.npz 
new file mode 100644 index 0000000000000000000000000000000000000000..1d54b07dc4863b6ef4a7f1cadb8ac1a077c90213 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/depth/depth_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44545b6a7dcccf82ba4e21e4713ab88b7c59de87b54c4503b420ec558aaf5514 +size 2280936 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/file_manifest.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/file_manifest.txt new file mode 100644 index 0000000000000000000000000000000000000000..67511cd607f069662806911672380535f66813ff --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/file_manifest.txt @@ -0,0 +1,38 @@ +Original sequence: sample_00003 +Data split: validation +Original path: ../data/movi_a_128x128/validation/sample_00003 +Copied at: 2025-12-05T18:42:57.529885 + +Files included: +- Full_Sample_Data_for_Learning_Target.npz +- camera_trajectory.npz +- depth/depth_merge.npz +- metadata.json +- normal/normal_merge.npz +- object_coordinates/object_coordinates_merge.npz +- point_clouds/point_clouds_merge.npz +- rgb/frame_000.png +- rgb/frame_001.png +- rgb/frame_002.png +- rgb/frame_003.png +- rgb/frame_004.png +- rgb/frame_005.png +- rgb/frame_006.png +- rgb/frame_007.png +- rgb/frame_008.png +- rgb/frame_009.png +- rgb/frame_010.png +- rgb/frame_011.png +- rgb/frame_012.png +- rgb/frame_013.png +- rgb/frame_014.png +- rgb/frame_015.png +- rgb/frame_016.png +- rgb/frame_017.png +- rgb/frame_018.png +- rgb/frame_019.png +- rgb/frame_020.png +- rgb/frame_021.png +- rgb/frame_022.png +- rgb/frame_023.png +- segmentation/segmentation_merge.npz diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/metadata.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/metadata.json new file mode 
100644 index 0000000000000000000000000000000000000000..b0aaf15ba838883272b8f17464f87ac9e0810d98 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/metadata.json @@ -0,0 +1,78 @@ +{ + "num_frames": 24, + "num_instances": 3, + "resolution": 128, + "depth_range": [ + 6.550915718078613, + 56.188716888427734 + ], + "camera": { + "focal_length": 35.0, + "sensor_width": 32.0, + "field_of_view": 0.8575560450553894, + "K": [ + [ + 140.0, + 0.0, + 64.0 + ], + [ + 0.0, + 140.0, + 64.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ] + }, + "instances": [ + { + "id": 1, + "shape": "cylinder", + "size": "large", + "color": "yellow", + "color_rgb": [ + 1.0, + 0.9333333373069763, + 0.019607843831181526 + ], + "material": "metal", + "mass": 5.784790992736816, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 2, + "shape": "cube", + "size": "small", + "color": "red", + "color_rgb": [ + 0.6784313917160034, + 0.13725490868091583, + 0.13725490868091583 + ], + "material": "rubber", + "mass": 0.37488529086112976, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 3, + "shape": "sphere", + "size": "small", + "color": "yellow", + "color_rgb": [ + 1.0, + 0.9333333373069763, + 0.019607843831181526 + ], + "material": "metal", + "mass": 0.483887255191803, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/normal/normal_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/normal/normal_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..14b829044e482c5c660230c12155acb582b305b0 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/normal/normal_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7023720e6b3894dd19b975f4a8a324dfddcc9919e585afb03980295848ea699d +size 99190 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/object_coordinates/object_coordinates_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/object_coordinates/object_coordinates_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..5d2f78c1d2f35dda8e153c7e7f1d15b978e832dc --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/object_coordinates/object_coordinates_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1fb6816b94022d636cd898ff2ae4ef10821b7c48bbb5c0017d07885d34c49e +size 1603876 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/point_clouds/point_clouds_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/point_clouds/point_clouds_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..3b119357c2487a959e32cb2d2968cd079387712e --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/point_clouds/point_clouds_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37a425c04a56e4bc299b4d13ead2202a9d5151c0c0fee76335156b1b62d4b822 +size 8481567 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_000.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_000.png new file mode 100644 index 0000000000000000000000000000000000000000..607183f0838599595a16a3ee1c3489afe3c87f9c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_000.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_001.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_001.png new file mode 100644 index 0000000000000000000000000000000000000000..34a6037fe93765009e4482fe1adcdfb167a46b3c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_001.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_002.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_002.png new file mode 100644 index 0000000000000000000000000000000000000000..f45df30f253f88020f00fa3ced9f3447cffe75e6 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_002.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_003.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_003.png new file mode 100644 index 0000000000000000000000000000000000000000..c99007718fbfb317de1e6b141b96d905a1bf6f26 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_003.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_004.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_004.png new file mode 100644 index 0000000000000000000000000000000000000000..bfbb5153dcf726c89acf42175cf27a733a25798e Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_004.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_005.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_005.png new file mode 100644 index 
0000000000000000000000000000000000000000..6fc506fbac2076f4a1b5c877aafb523b2a0a18b3 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_005.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_006.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_006.png new file mode 100644 index 0000000000000000000000000000000000000000..9ad01236c0398aa6319c65d817c37b3d05eeba6c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_006.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_007.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_007.png new file mode 100644 index 0000000000000000000000000000000000000000..51cebf54e5dcd39ce3c77b375630fadc9f339476 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_007.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_008.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_008.png new file mode 100644 index 0000000000000000000000000000000000000000..c0989f5808ef8ba019b7df22af46bc502841a32a Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_008.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_009.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_009.png new file mode 100644 index 0000000000000000000000000000000000000000..ee1e891754be1efef88ab8514f9b9c901777fd7a Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_009.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_010.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_010.png new file mode 100644 index 0000000000000000000000000000000000000000..938f9be70a0e084ca63c8910c2157a21ffe7244c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_010.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_011.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_011.png new file mode 100644 index 0000000000000000000000000000000000000000..06de643d7231c3ef56fc5b0ef374d086eb9968a5 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_011.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_012.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_012.png new file mode 100644 index 0000000000000000000000000000000000000000..18a5b1d8af42d0d292ce710ffeb30ed3f78dfdb3 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_012.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_013.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_013.png new file mode 100644 index 0000000000000000000000000000000000000000..e5427a61438e874271e4e8b51ec37e583ad27832 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_013.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_014.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_014.png new file mode 100644 index 0000000000000000000000000000000000000000..376c34072b5b928622fa8c85ca8707664a5c091f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_014.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_015.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_015.png new file mode 100644 index 0000000000000000000000000000000000000000..157489aa40678f6ccc261bc20ad2546737b3621c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_015.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_016.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_016.png new file mode 100644 index 0000000000000000000000000000000000000000..a32eca8fb55b3716bfba7f0a90ef6f37cc48d338 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_016.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_017.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_017.png new file mode 100644 index 0000000000000000000000000000000000000000..979772c2383ca750d135cc378f9b08cdedbb8fc0 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_017.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_018.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_018.png new file mode 100644 index 0000000000000000000000000000000000000000..3ed77a6bf358e16d91b976470934965aa320edef Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_018.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_019.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_019.png new file mode 100644 index 0000000000000000000000000000000000000000..cf8cd16933a14b6ffd3452b0063fd0f96128066e Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_019.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_020.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_020.png new file mode 100644 index 0000000000000000000000000000000000000000..20970fe55949f892d1324f999a059b55e12f4282 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_020.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_021.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_021.png new file mode 100644 index 0000000000000000000000000000000000000000..783408cf6284e7890d08790d2ced0a9ea1e6744c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_021.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_022.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_022.png new file mode 100644 index 
0000000000000000000000000000000000000000..7924163ddaccaef75462cd79ceb74af26814f06c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_022.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_023.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_023.png new file mode 100644 index 0000000000000000000000000000000000000000..d476099fd012b5039d2020e4e2472aff0eca579f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/rgb/frame_023.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/segmentation/segmentation_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/segmentation/segmentation_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..6a9aef48cbb631a36a7147f43392504e9d401e05 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/original_data/segmentation/segmentation_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7dd5ccd009056f9d8de97459e6e3c2bb164eb2d8e0b706ae73b01a79213891c +size 8716 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/predictions.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/predictions.npz new file mode 100644 index 0000000000000000000000000000000000000000..54df4d69299d5a033b5a4c9e28b0917a75f8f44c --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/predictions.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303562e2b3ef5b41727ba4b7c7adc5f9a146b86146b02e1f441b39a682162867 +size 124388 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/targets.npz 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/targets.npz new file mode 100644 index 0000000000000000000000000000000000000000..e62738a0ff189a6ccb1578fcd32f442a0979e5a6 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_3/targets.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2167a30d7e74575aaf48af99f21fb79f0630e258a2ff3802487d60ad2d35ca +size 115652 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/error_statistics.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/error_statistics.json new file mode 100644 index 0000000000000000000000000000000000000000..644e8ac16020417122ca91667c01b4dc64674928 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/error_statistics.json @@ -0,0 +1,4 @@ +{ + "object_mae": 1.3517074584960938, + "world_mae": 2.922047750295557 +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/info.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/info.txt new file mode 100644 index 0000000000000000000000000000000000000000..4857ea6c0844f4833a860218d4fb53d2a04c064d --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/info.txt @@ -0,0 +1,10 @@ +Text: large gray rubber sphere, small cyan metal cylinder, small red rubber sphere +Generated at step: 5 +Number of frames: 24 +Sequence: sample_00004 + +--- Model Output Summary --- +Max objects: 10 +Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3]) +World parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1]) +Physics parameters: 3 (mass + friction + restitution) diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/Full_Sample_Data_for_Learning_Target.npz 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/Full_Sample_Data_for_Learning_Target.npz new file mode 100644 index 0000000000000000000000000000000000000000..df0c063d3e9991cb72f4e5063a0386ab4421d8c0 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/Full_Sample_Data_for_Learning_Target.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd71813aa5a950a0919facb03e16908af63cd221e199c2ac2be3d1b3c994ad8a +size 32808 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/camera_trajectory.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/camera_trajectory.npz new file mode 100644 index 0000000000000000000000000000000000000000..79dbe282e1d6344beaf66f97e5e4e5af49295b44 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/camera_trajectory.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7629c60b74dea0c2a90f44ea426db8500eb0ea1185089ae953450012b107280c +size 450 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/depth/depth_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/depth/depth_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..e1b851e0bcd046c8b3f586d317c561198d402aaf --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/depth/depth_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:819d926d04d57d0da6a37a249aed38da23044958e509c8411bb1fa292684e598 +size 2332494 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/file_manifest.txt b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/file_manifest.txt new file mode 100644 index 
0000000000000000000000000000000000000000..3252d91b9087daca5c238156540d3cf1db4320ff --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/file_manifest.txt @@ -0,0 +1,38 @@ +Original sequence: sample_00004 +Data split: validation +Original path: ../data/movi_a_128x128/validation/sample_00004 +Copied at: 2025-12-05T18:42:58.356189 + +Files included: +- Full_Sample_Data_for_Learning_Target.npz +- camera_trajectory.npz +- depth/depth_merge.npz +- metadata.json +- normal/normal_merge.npz +- object_coordinates/object_coordinates_merge.npz +- point_clouds/point_clouds_merge.npz +- rgb/frame_000.png +- rgb/frame_001.png +- rgb/frame_002.png +- rgb/frame_003.png +- rgb/frame_004.png +- rgb/frame_005.png +- rgb/frame_006.png +- rgb/frame_007.png +- rgb/frame_008.png +- rgb/frame_009.png +- rgb/frame_010.png +- rgb/frame_011.png +- rgb/frame_012.png +- rgb/frame_013.png +- rgb/frame_014.png +- rgb/frame_015.png +- rgb/frame_016.png +- rgb/frame_017.png +- rgb/frame_018.png +- rgb/frame_019.png +- rgb/frame_020.png +- rgb/frame_021.png +- rgb/frame_022.png +- rgb/frame_023.png +- segmentation/segmentation_merge.npz diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/metadata.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5fbddac5c42903254ec5411551da7f4f1fcc7f --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/metadata.json @@ -0,0 +1,78 @@ +{ + "num_frames": 24, + "num_instances": 3, + "resolution": 128, + "depth_range": [ + 5.7761054039001465, + 46.58609390258789 + ], + "camera": { + "focal_length": 35.0, + "sensor_width": 32.0, + "field_of_view": 0.8575560450553894, + "K": [ + [ + 140.0, + 0.0, + 64.0 + ], + [ + 0.0, + 140.0, + 64.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ] + }, + "instances": [ + { + "id": 1, + "shape": 
"sphere", + "size": "large", + "color": "gray", + "color_rgb": [ + 0.34117648005485535, + 0.34117648005485535, + 0.34117648005485535 + ], + "material": "rubber", + "mass": 1.5771139860153198, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + }, + { + "id": 2, + "shape": "cylinder", + "size": "small", + "color": "cyan", + "color_rgb": [ + 0.16078431904315948, + 0.8156862854957581, + 0.8156862854957581 + ], + "material": "metal", + "mass": 0.723098874092102, + "friction": 0.4000000059604645, + "restitution": 0.30000001192092896 + }, + { + "id": 3, + "shape": "sphere", + "size": "small", + "color": "red", + "color_rgb": [ + 0.6784313917160034, + 0.13725490868091583, + 0.13725490868091583 + ], + "material": "rubber", + "mass": 0.19713924825191498, + "friction": 0.800000011920929, + "restitution": 0.699999988079071 + } + ] +} \ No newline at end of file diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/normal/normal_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/normal/normal_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..c599590958daff9a94dfb23bd72d86dcaa05f786 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/normal/normal_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fddf59b33aa6735b443b01f072b31b14a3cd02f2dc6fb26b8754c90f2c2cbd1f +size 91612 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/object_coordinates/object_coordinates_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/object_coordinates/object_coordinates_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..fe23feb1cfa9edd96500323aae9d66da28ffb258 --- /dev/null +++ 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/object_coordinates/object_coordinates_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ed558c1ed48e206d7fef98dd5d3c7dbce17569eec23b8dcd4cdc0e29bbaf71 +size 1603715 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/point_clouds/point_clouds_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/point_clouds/point_clouds_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..20ec5962dd2cf70109d19f1b6289eae3ecdc8702 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/point_clouds/point_clouds_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:361c572d15f6013c92ada33b6477b9b1e94c3996880d8bed4c2df65212f37723 +size 8535397 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_000.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_000.png new file mode 100644 index 0000000000000000000000000000000000000000..3ba2eac9678a082d311275e853c166eece131a42 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_000.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_001.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_001.png new file mode 100644 index 0000000000000000000000000000000000000000..30f022ab35e0f1d14b961082c4490255488e8fd8 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_001.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_002.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_002.png new file mode 100644 index 0000000000000000000000000000000000000000..09a04968889c71303dafbd38654929e8aacafb10 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_002.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_003.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_003.png new file mode 100644 index 0000000000000000000000000000000000000000..68d33f0989349807213200a92907673a009c2208 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_003.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_004.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_004.png new file mode 100644 index 0000000000000000000000000000000000000000..38eb3de0ad7d7af8c80ff720add3e792b7b7234a Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_004.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_005.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_005.png new file mode 100644 index 0000000000000000000000000000000000000000..4f01363a0c91511a671201b8de6ae7ab3d9701c4 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_005.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_006.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_006.png new file mode 100644 index 
0000000000000000000000000000000000000000..2fd56740866052a9b03cff1a2bd49f2fc912ece7 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_006.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_007.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_007.png new file mode 100644 index 0000000000000000000000000000000000000000..af99a6fb22eb445526b4fc03d082d4c006dd3d6f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_007.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_008.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_008.png new file mode 100644 index 0000000000000000000000000000000000000000..360b1450cf016bc01f0dd191eb052051a8f98eb9 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_008.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_009.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_009.png new file mode 100644 index 0000000000000000000000000000000000000000..aac655c922559f2c3e6bda4fd5356922c2ac8da1 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_009.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_010.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_010.png new file mode 100644 index 0000000000000000000000000000000000000000..22abf0daf4a18e6adefb6bed488d5b938a906b81 Binary files /dev/null and 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_010.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_011.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_011.png new file mode 100644 index 0000000000000000000000000000000000000000..a83514ddf275b3e411e479db0d0223c6e8728948 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_011.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_012.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_012.png new file mode 100644 index 0000000000000000000000000000000000000000..1563c389c39e0f2ac8a64e7a3c31291cfc089951 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_012.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_013.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_013.png new file mode 100644 index 0000000000000000000000000000000000000000..7503b7c56640bfc9c874062866f962083efd538c Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_013.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_014.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_014.png new file mode 100644 index 0000000000000000000000000000000000000000..5fefe10fac16ad105a70d27b8a7a3829ddcc9bc1 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_014.png differ diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_015.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_015.png new file mode 100644 index 0000000000000000000000000000000000000000..877c421114704cfd4c56488de75ac360d671de8f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_015.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_016.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_016.png new file mode 100644 index 0000000000000000000000000000000000000000..b0b4ca7212714283c7bddfddce5b227b0bdbc6eb Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_016.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_017.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_017.png new file mode 100644 index 0000000000000000000000000000000000000000..9b50b5280aea6f063b6d19c05b5d00858497a494 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_017.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_018.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_018.png new file mode 100644 index 0000000000000000000000000000000000000000..2e690fed93f5b1f90463e3e180b7bfab7bf12af0 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_018.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_019.png 
b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_019.png new file mode 100644 index 0000000000000000000000000000000000000000..7974d9824c7d56126be8825e74bbe03b72d94844 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_019.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_020.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_020.png new file mode 100644 index 0000000000000000000000000000000000000000..ed00fcca7cd626999941ce3cc99b4ffbf90f75b7 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_020.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_021.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_021.png new file mode 100644 index 0000000000000000000000000000000000000000..6e4113a4f065150add38ba627ddd20c7bd74a5df Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_021.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_022.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_022.png new file mode 100644 index 0000000000000000000000000000000000000000..3f13c6a102a34f87a74ae21eb76ce81e779cfad3 Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_022.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_023.png b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_023.png new file mode 100644 index 
0000000000000000000000000000000000000000..cbaee736699c7a796259c71ce375a29edbe53b3f Binary files /dev/null and b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/rgb/frame_023.png differ diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/segmentation/segmentation_merge.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/segmentation/segmentation_merge.npz new file mode 100644 index 0000000000000000000000000000000000000000..86ae5f278b246c86ff1d9bbea37050304de7040d --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/original_data/segmentation/segmentation_merge.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5535cb97eaa1bacaa385414da9f0c928036b33e44e2e4461d87152cfeb5dfb5a +size 8589 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/predictions.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/predictions.npz new file mode 100644 index 0000000000000000000000000000000000000000..8727a7a6a5610843dbe6a4bb7242e63ff01c0efd --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/predictions.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc320304c861971d1e0fa49469fa842f1c3c45312c99a6f65bd5b9befb8fc65a +size 124384 diff --git a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/targets.npz b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/targets.npz new file mode 100644 index 0000000000000000000000000000000000000000..7b4adb7b218e98c92ffbbeaf8e6ebd257519dcad --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/sample_4/targets.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91cf32962dd5c412a52a200558b902dadd4647e9e1aeb2c4dddb349f2a7e476b +size 115648 diff --git 
a/nano_WaveGen/core_space/20251205_184253_step5_text2wave/save_config.json b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/save_config.json new file mode 100644 index 0000000000000000000000000000000000000000..120fb07fcc5880ee060b38e7869a81464b122b76 --- /dev/null +++ b/nano_WaveGen/core_space/20251205_184253_step5_text2wave/save_config.json @@ -0,0 +1,7 @@ +{ + "enabled": true, + "save_gt": true, + "fixed_samples": 5, + "save_interval": 100, + "save_dir": "core_space" +} \ No newline at end of file diff --git a/nano_WaveGen/env/requirements_WaveGen.txt b/nano_WaveGen/env/requirements_WaveGen.txt new file mode 100644 index 0000000000000000000000000000000000000000..262536d182eb6fe922d81268b01e36e64044979e --- /dev/null +++ b/nano_WaveGen/env/requirements_WaveGen.txt @@ -0,0 +1,56 @@ +# Please update setup.py when modifying this file + +# WaveGen basic +accelerate>=1.2.0 +clip@https://github.com/openai/CLIP/tarball/master#egg=clip-1.0.0 +diffusers>=0.31.0 +matplotlib>=3.9.3 +numpy==1.26.4 +packaging +Pillow +plotly +scikit_learn +scipy +sympy +timm>=1.0.12 +torch>=2.5.0 +torchvision>=0.20.0 +tqdm +transformers>=4.36.0 +ipykernel +nbformat>=4.2.0 +mayavi +numba +wandb +scikit_learn +numba +plyfile +sentencepiece + + +# ControlWave +gradio>=4.0.0 +pathlib +plotly +viser +open3d +fastapi +uvicorn + + +# wave2pixel +trimesh +pyrender +SpeechRecognition +whisper +bpy +lightning[pytorch-extra]<2.6,>=2.1 +pandas==2.2.3 +pyrender +xFormers +pytorch-msssim +transformers + + +#jupyter +ipywidgets \ No newline at end of file diff --git a/nano_WaveGen/launch_text2wave_training.sh b/nano_WaveGen/launch_text2wave_training.sh new file mode 100644 index 0000000000000000000000000000000000000000..e5556e9ef11669c8f0c0d5cfb28d9523378fbc25 --- /dev/null +++ b/nano_WaveGen/launch_text2wave_training.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash + +# Relaunch with bash if executed via sh/dash +if [ -z "$BASH_VERSION" ]; then + exec /bin/bash "$0" "$@" +fi + +# Launch training with 
accelerate +# Usage: +# bash launch_text2wave_training.sh # Start new training +# bash launch_text2wave_training.sh 1000 # Resume from step 1000 +# python train_text2wave.py --help +# +# Note: Generation saving is controlled by configs/default.yaml (enabled by default) + +# Configuration +export OMP_NUM_THREADS=8 +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +WAVEGEN_ROOT="$( cd -- "$SCRIPT_DIR/.." &> /dev/null && pwd )" +PROJECT_ROOT="$( cd -- "$SCRIPT_DIR/../.." &> /dev/null && pwd )" +export PYTHONPATH="$SCRIPT_DIR:$WAVEGEN_ROOT:$PROJECT_ROOT:${PYTHONPATH}" + +# Helper: read GPU request and fallback list from config +readarray -t GPU_CONFIG <<< "$(python - <<'PY' +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import yaml +try: + with open('configs/default.yaml', 'r') as f: + cfg = yaml.safe_load(f) or {} + gpu_list = cfg.get('training', {}).get('gpu_list') + if isinstance(gpu_list, (list, tuple)) and gpu_list: + gpu_list = [int(g) for g in gpu_list] + print(len(gpu_list)) + print(','.join(map(str, gpu_list))) + else: + print(0) + print('') +except Exception: + print(0) + print('') +PY +)" + +REQUESTED_GPU_COUNT=${GPU_CONFIG[0]:-0} +CONFIG_GPU_LIST=${GPU_CONFIG[1]} + +if [ "$REQUESTED_GPU_COUNT" -le 0 ]; then + REQUESTED_GPU_COUNT=1 +fi + +if [ -n "$CUDA_VISIBLE_DEVICES" ]; then + echo "CUDA_VISIBLE_DEVICES preset to $CUDA_VISIBLE_DEVICES" + SELECTED_GPUS=$CUDA_VISIBLE_DEVICES +else + export REQUESTED_GPU_COUNT + export CONFIG_GPU_LIST + SELECTED_GPUS=$(python - <<'PY' +import os +import sys +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from utils.gpu_utils import select_gpus + +requested = int(os.environ.get('REQUESTED_GPU_COUNT', '1')) +threshold_env = os.environ.get('GPU_IDLE_MEMORY_THRESHOLD') +threshold = int(threshold_env) if threshold_env else None + +fallback_env = os.environ.get('CONFIG_GPU_LIST', '') +fallback = [int(x) for x in fallback_env.split(',') if 
x.strip()] or None + +selected = select_gpus(requested, threshold, fallback) +print(','.join(str(i) for i in selected)) +PY +) +fi + +if [ -z "$SELECTED_GPUS" ]; then + echo "Error: Unable to select any available GPU." + exit 1 +fi + +NUM_GPUS=$(echo "$SELECTED_GPUS" | tr ',' '\n' | wc -l) +if [ "$NUM_GPUS" -lt "$REQUESTED_GPU_COUNT" ]; then + echo "Warning: Requested $REQUESTED_GPU_COUNT GPU(s) but only $NUM_GPUS detected/selected." +fi + +export CUDA_VISIBLE_DEVICES=$SELECTED_GPUS + +# Detect CPU concurrency for preprocessing (default use all cores, cap at 48 to avoid overload) +CPU_TOTAL=$(nproc) +if [ -z "$PREPROCESS_NUM_WORKERS" ]; then + if [ "$CPU_TOTAL" -gt 48 ]; then + PREPROCESS_NUM_WORKERS=48 + else + PREPROCESS_NUM_WORKERS=$CPU_TOTAL + fi +fi +echo "Using ${PREPROCESS_NUM_WORKERS} CPU workers for preprocessing (total cores: ${CPU_TOTAL})" + +# Create output directory +mkdir -p core_space + +# Check if accelerate config exists +if [ ! -f configs/accelerate_config.yaml ]; then + echo "Error: configs/accelerate_config.yaml not found!" + echo "Please ensure accelerate config exists in the configs directory." + exit 1 +fi + +# Determine how many samples to preprocess based on config +MAX_SAMPLES=$(python - <<'PY' +import yaml +try: + with open('configs/default.yaml', 'r') as f: + cfg = yaml.safe_load(f) + value = cfg.get('data', {}).get('max_sequences', -1) + if value in (None, -1): + print(-1) + else: + print(int(value)) +except Exception as exc: + print(-1) +PY +) + +echo "Checking dataset cache status... 
(max_samples=${MAX_SAMPLES})" +# Preprocess training split +python ../data/preprocess_dataset.py \ + --data_root ../data/movi_a_128x128 \ + --split train \ + --max_samples ${MAX_SAMPLES} \ + --num_workers ${PREPROCESS_NUM_WORKERS} + +# Also preprocess validation set (matching limit) +python ../data/preprocess_dataset.py \ + --data_root ../data/movi_a_128x128 \ + --split validation \ + --max_samples ${MAX_SAMPLES} \ + --num_workers ${PREPROCESS_NUM_WORKERS} + +echo "Dataset preprocessing complete." + +# Parse command line arguments +RESUME_STEP="" + +# Check for resume step argument +if [ ! -z "$1" ]; then + if [[ "$1" =~ ^[0-9]+$ ]]; then + RESUME_STEP="--resume_step $1" + echo "Resuming training from step $1" + fi +fi + +# Note: Generation saving is now controlled by config file (enabled by default) + +# Launch training +echo "Starting training on $(date)" +echo "Using GPUs: $CUDA_VISIBLE_DEVICES" +LAUNCH_ARGS=( + --config_file configs/accelerate_config.yaml + --num_processes "$NUM_GPUS" + --mixed_precision bf16 +) + +# 注意:T5模型原本是用bfloat16训练的,使用fp16会导致NaN +# 如需启用混合精度,建议使用bf16而不是fp16 +accelerate launch \ + "${LAUNCH_ARGS[@]}" \ + train_text2wave.py \ + --train_config configs/default.yaml \ + --data_root ../data/movi_a_128x128 \ + --output_dir core_space \ + $RESUME_STEP + +echo "Training completed on $(date)" diff --git a/nano_WaveGen/train_text2wave.py b/nano_WaveGen/train_text2wave.py new file mode 100644 index 0000000000000000000000000000000000000000..107996fe57edfcdd69778ce292b2395465de2dd1 --- /dev/null +++ b/nano_WaveGen/train_text2wave.py @@ -0,0 +1,1451 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader +from transformers import LongT5ForConditionalGeneration, T5ForConditionalGeneration, T5Tokenizer +from accelerate import Accelerator +from accelerate.utils import set_seed +from concurrent.futures import ThreadPoolExecutor +import numpy as np +from pathlib import Path +import yaml +from tqdm 
import tqdm +from typing import Dict, List, Tuple, Optional +import argparse +import os +import re +import warnings +from collections import defaultdict +import time +from datetime import datetime +import sys +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +# Add parent directory to path to find data and utils modules +SCRIPT_DIR = Path(__file__).resolve().parent +WAVEGEN_ROOT = SCRIPT_DIR.parent +if str(WAVEGEN_ROOT) not in sys.path: + sys.path.insert(0, str(WAVEGEN_ROOT)) + +# Suppress the specific transformers warning about past_key_values +warnings.filterwarnings("ignore", message="Passing a tuple of `past_key_values` is deprecated") + +from data.movi_dataset import create_dataloader +from utils.save_generation_results import save_generation_results + + +class Text2WaveModel(nn.Module): + """Text to Superquadric Wave Parameters Model""" + + def __init__( + self, + model_name: str = "google/long-t5-tglobal-base", + max_objects: int = 10, + num_frames: int = 24, + max_history_frames: int = 3, + random_history_sampling: bool = True, + decoder_noise_std: float = 0.0, + ): + super().__init__() + + self.max_objects = max_objects + self.num_frames = num_frames + self.max_history_frames = max_history_frames + self.random_history_sampling = random_history_sampling + self.decoder_noise_std = float(decoder_noise_std) + # exists(1) + shape(2) + scale(3) + translation(3) + rotation(3) + velocity(3) + self.object_param_dim = 15 + + # Load appropriate T5-family model (LongT5 for large checkpoints, vanilla T5 for smaller variants) + self.model_name = model_name + self.is_longt5 = "long-t5" in model_name.lower() + self.tokenizer = T5Tokenizer.from_pretrained(model_name) + if self.is_longt5: + self.t5_model = LongT5ForConditionalGeneration.from_pretrained(model_name) + else: + self.t5_model = T5ForConditionalGeneration.from_pretrained(model_name) + + # Resize model embeddings to match tokenizer if needed + if self.tokenizer.vocab_size != 
self.t5_model.config.vocab_size: + self.t5_model.resize_token_embeddings(self.tokenizer.vocab_size) + + # Get T5 hidden size + self.hidden_size = self.t5_model.config.d_model + + # Output projection layers + # Object parameters: exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3] + self.object_proj = nn.Linear(self.hidden_size, max_objects * self.object_param_dim) + + # World parameters: camera_pos(3) + camera_quat(4) + scene_scale(1) = 8 + self.world_proj = nn.Linear(self.hidden_size, 8) + + # Physics parameters: mass(1) + friction(1) + restitution(1) = 3 + self.physics_proj = nn.Linear(self.hidden_size, max_objects * 3) + + # Relative time embedding + self.time_embed = nn.Linear(1, self.hidden_size) + + # History embedding (autoregressive context up to max_history_frames) + history_feature_dim = max_history_frames * (max_objects * self.object_param_dim + 8) + max_objects * 3 + self.history_feature_dim = history_feature_dim + self.history_proj = nn.Linear(history_feature_dim, self.hidden_size) + + + # Initialize weights with small values to prevent NaN + self._init_weights() + + def _init_weights(self): + """Initialize weights for stability""" + # Very small initialization for output projections + for module in [self.object_proj, self.world_proj, self.physics_proj]: + nn.init.normal_(module.weight, mean=0.0, std=0.02) + nn.init.zeros_(module.bias) + + # Time embedding initialization + nn.init.normal_(self.time_embed.weight, mean=0.0, std=0.02) + nn.init.zeros_(self.time_embed.bias) + + # History embedding initialization + nn.init.normal_(self.history_proj.weight, mean=0.0, std=0.02) + nn.init.zeros_(self.history_proj.bias) + + def _initialize_history_state( + self, + history_frames: Optional[Dict[str, torch.Tensor]], + batch_size: int, + device: torch.device, + ) -> Tuple[List[Dict[str, torch.Tensor]], torch.Tensor]: + """Prepare history buffer and physics state for autoregressive decoding.""" + history_buffer: List[Dict[str, torch.Tensor]] = 
[] + + physics_state = torch.zeros( + batch_size, + self.max_objects, + 3, + device=device, + dtype=torch.float32, + ) + + if history_frames is not None: + objects_hist = history_frames.get('objects') + world_hist = history_frames.get('world') + physics_hist = history_frames.get('physics') + + if physics_hist is not None: + physics_state = physics_hist.to(device=device, dtype=torch.float32) + + if objects_hist is not None and world_hist is not None: + history_len = objects_hist.shape[1] + for idx in range(history_len): + history_buffer.append({ + 'objects': objects_hist[:, idx, :, :self.object_param_dim].to(device=device, dtype=torch.float32), + 'world': world_hist[:, idx, :8].to(device=device, dtype=torch.float32), + }) + + if len(history_buffer) == 0: + history_buffer.append({ + 'objects': torch.zeros(batch_size, self.max_objects, self.object_param_dim, device=device), + 'world': torch.zeros(batch_size, 8, device=device), + }) + + history_buffer = history_buffer[-self.max_history_frames:] + + return history_buffer, physics_state + + def sample_decoder_noise(self, batch_size: int, device: torch.device) -> Optional[torch.Tensor]: + """Sample decoder noise embedding when noise std > 0.""" + if self.decoder_noise_std <= 0: + return None + noise = torch.randn(batch_size, self.hidden_size, device=device) + return noise * self.decoder_noise_std + + def _build_history_embedding( + self, + history_buffer: List[Dict[str, torch.Tensor]], + physics_state: torch.Tensor, + use_frames: int, + ) -> torch.Tensor: + """Convert most recent history frames into conditioning embedding.""" + batch_size = physics_state.shape[0] + device = physics_state.device + + frame_dim = self.max_objects * self.object_param_dim + 8 + history_tensor = torch.zeros( + batch_size, + self.max_history_frames * frame_dim, + device=device, + ) + + use_frames = min(use_frames, self.max_history_frames) + recent_frames = history_buffer[-use_frames:] if use_frames > 0 else [] + for slot, frame in 
enumerate(recent_frames): + offset = slot * frame_dim + obj_flat = frame['objects'].reshape(batch_size, -1) + world_feat = frame['world'] + history_tensor[:, offset:offset + obj_flat.shape[1]] = obj_flat + history_tensor[:, offset + obj_flat.shape[1]:offset + frame_dim] = world_feat + + physics_flat = physics_state.reshape(batch_size, -1) + history_features = torch.cat([history_tensor, physics_flat], dim=-1) + return self.history_proj(history_features) + + def forward( + self, + input_text: List[str], + target_frames: torch.Tensor, # [batch, num_target_frames, ...] + history_frames: Optional[Dict[str, torch.Tensor]] = None, # History context (objects/world/physics) + relative_times: torch.Tensor = None, # [batch, num_target_frames] + static_object_params: Optional[torch.Tensor] = None, # Optional static params to enforce (exists+shape+scale) + noise: Optional[torch.Tensor] = None, # Optional additive noise for decoder embeddings + ): + """ + Forward pass for text to wave parameter generation + + Args: + input_text: List of text descriptions + target_frames: Target frame indices to predict + history_frames: Optional history frames for conditioning + relative_times: Relative time positions [-1, 1] for each target frame + """ + batch_size = len(input_text) + num_target_frames = target_frames.shape[1] + + # Format input text for T5 task + # Use standard T5 format for text-to-text generation + formatted_text = [f"translate to wave: {text}" for text in input_text] + + # Tokenize input text + text_inputs = self.tokenizer( + formatted_text, + padding=True, + truncation=True, + max_length=512, + return_tensors="pt" + ).to(target_frames.device) + + + # Encode text with T5 + try: + # First, let's try a simple forward pass with dummy decoder input + # T5 expects decoder_input_ids starting with pad token + decoder_start_token_id = self.t5_model.config.pad_token_id + decoder_input_ids = torch.full( + (batch_size, 1), + decoder_start_token_id, + dtype=torch.long, + 
                device=text_inputs.input_ids.device
            )

            # Try using the full model forward pass
            outputs = self.t5_model(
                input_ids=text_inputs.input_ids,
                attention_mask=text_inputs.attention_mask,
                decoder_input_ids=decoder_input_ids,
                return_dict=True,
                output_hidden_states=True
            )

            encoder_outputs = outputs.encoder_last_hidden_state
        except Exception as e:
            # log_message is only defined at runtime (inside main); fall back to print.
            if 'log_message' in globals():
                log_message(f"ERROR in encoder: {e}")
            else:
                print(f"ERROR in encoder: {e}")
            raise

        # Autoregressive decoding with history conditioning
        history_buffer, physics_state = self._initialize_history_state(
            history_frames,
            batch_size,
            target_frames.device,
        )

        if static_object_params is not None:
            static_object_params = static_object_params.to(
                device=target_frames.device,
                dtype=torch.float32,
            )

        if noise is not None:
            noise = noise.to(device=encoder_outputs.device, dtype=encoder_outputs.dtype)

        outputs = []

        for f in range(num_target_frames):
            if self.random_history_sampling:
                # Randomly vary how many history frames condition each step (acts as
                # a regulariser during training).
                max_available = min(len(history_buffer), self.max_history_frames)
                if max_available > 0:
                    use_history = int(torch.randint(
                        low=0,
                        high=max_available + 1,
                        size=(1,),
                        device=encoder_outputs.device,
                    ).item())
                else:
                    use_history = 0
            else:
                use_history = min(len(history_buffer), self.max_history_frames)

            if relative_times is not None:
                time_input = relative_times[:, f:f+1].unsqueeze(-1)
                time_embed = self.time_embed(time_input).squeeze(1)
            else:
                time_embed = torch.zeros(
                    batch_size,
                    self.hidden_size,
                    device=encoder_outputs.device,
                )

            history_embed = self._build_history_embedding(history_buffer, physics_state, use_history)
            decoder_embed = time_embed + history_embed
            if noise is not None:
                decoder_embed = decoder_embed + noise

            decoder_output = self.t5_model.decoder(
                inputs_embeds=decoder_embed.unsqueeze(1),  # [batch, 1, hidden_size]
                encoder_hidden_states=encoder_outputs,
                encoder_attention_mask=text_inputs.attention_mask,
            )

            hidden = decoder_output.last_hidden_state[:, 0]  # [batch, hidden_size]

            object_params = self.object_proj(hidden).view(batch_size, self.max_objects, self.object_param_dim)
            if static_object_params is not None:
                # Preserve the first 6 dimensions (exists + shape + scale) from provided static parameters
                static_slice = static_object_params[:, :, :6]
                if static_slice.shape[-1] < 6:
                    pad_width = 6 - static_slice.shape[-1]
                    pad = torch.zeros(*static_slice.shape[:-1], pad_width, device=object_params.device)
                    static_slice = torch.cat([static_slice, pad], dim=-1)
                # clone() before the in-place write keeps the autograd graph intact.
                object_params = object_params.clone()
                object_params[:, :, :6] = static_slice
            world_params = self.world_proj(hidden)
            physics_params = self.physics_proj(hidden).view(batch_size, self.max_objects, 3)

            outputs.append({
                'objects': object_params,
                'world': world_params,
                'physics': physics_params,
            })

            # Feed this frame's prediction back as history for the next frame.
            history_buffer.append({
                'objects': object_params,
                'world': world_params,
            })
            if len(history_buffer) > self.max_history_frames:
                history_buffer = history_buffer[-self.max_history_frames:]

            physics_state = physics_params

        return outputs


class BidirectionalTrainer:
    """Trainer for bidirectional prediction from middle frame"""

    def __init__(
        self,
        model: Text2WaveModel,
        config: Dict,
        accelerator: Accelerator,
    ):
        self.model = model
        self.config = config
        self.accelerator = accelerator
        # Unwrap DDP/accelerate wrappers so attributes like object_param_dim are reachable.
        base_model = accelerator.unwrap_model(model) if hasattr(accelerator, "unwrap_model") else model
        self.object_param_dim = getattr(base_model, "object_param_dim", 12)
        self.freeze_static_params = bool(config['training'].get('freeze_static_from_anchor', True))
        self.base_model = base_model
        self.sample_attempts = int(config['training'].get('multi_sample_attempts', 1))
        self.sample_attempts = max(1, self.sample_attempts)

        # Loss functions
        self.world_loss_fn = nn.MSELoss()
        self.physics_loss_fn = nn.MSELoss()

        # Loss weights from config
        loss_weights_config = 
config.get('loss', {}).get('weights', {})
        self.loss_weights = {
            'wave_loss(superquadric)': loss_weights_config.get('wave_loss', 1.0),
            'wave_contrastive_loss': loss_weights_config.get('wave_contrastive_loss', 2.0),
            'world_info_loss(camera,scale,time)': loss_weights_config.get('world_info_loss', 0.5),
            'controllable_info_loss(mass,friction,restitution)': loss_weights_config.get('controllable_info_loss', 0.1),
            'pla_loss': loss_weights_config.get('pla_loss', 3.0),
        }

        physics_cfg = config.get('physics', {})
        self.gravity = float(physics_cfg.get('gravity', 9.81))
        self.collision_buffer = float(physics_cfg.get('collision_buffer', 1.05))

        # Temporal configuration (dataset cached at 8 fps by default)
        self.frame_rate = float(config['training'].get('frame_rate', 8.0))
        self.frame_rate = max(self.frame_rate, 1e-6)

        presence_cfg = config.get('loss', {}).get('wave_presence', {})
        self.wave_count_weight = float(presence_cfg.get('count_weight', 0.2))
        self.wave_presence_threshold = float(presence_cfg.get('scale_threshold', 0.1))
        self.wave_presence_temperature = float(presence_cfg.get('temperature', 0.1))
        contrastive_cfg = config.get('loss', {}).get('wave_contrastive', {})
        self.wave_contrastive_temperature = float(contrastive_cfg.get('temperature', 0.2))

        # By convention the last three learnable slots before the inlier ratio store velocity
        self.velocity_slice = slice(max(self.object_param_dim - 3, 0), self.object_param_dim)

    def compute_loss(
        self,
        predictions: List[Dict],
        targets: Dict[str, torch.Tensor],
        frame_indices: List[int],
    ) -> Dict[str, torch.Tensor]:
        """Compute losses for predicted frames"""
        losses = {
            'wave_loss(superquadric)': 0.0,  # Wave loss (superquadric parameters)
            'wave_contrastive_loss': 0.0,  # Sequence-level contrastive alignment
            'world_info_loss(camera,scale,time)': 0.0,  # World info loss (camera, scale, relative time)
            'controllable_info_loss(mass,friction,restitution)': 0.0,  # Controllable info 
loss (mass, friction, restitution)
            'pla_loss': 0.0,  # Physical plausibility regularizer
            'wave_count_mse': 0.0,  # Count alignment between predicted and target waves
            'total': 0.0,
        }

        pla_entries = []
        pred_summaries: List[torch.Tensor] = []
        target_summaries: List[torch.Tensor] = []

        for i, (pred, frame_idx) in enumerate(zip(predictions, frame_indices)):
            # Object loss (only for existing objects)
            target_objects = targets['objects'][:, frame_idx]  # [batch, max_objects, 16]
            # Zero-pad target params up to the model's param width if needed.
            if target_objects.shape[-1] < self.object_param_dim:
                pad_width = self.object_param_dim - target_objects.shape[-1]
                pad = target_objects.new_zeros(*target_objects.shape[:-1], pad_width)
                target_objects = torch.cat([target_objects, pad], dim=-1)
            pred_objects = pred['objects']  # [batch, max_objects, self.object_param_dim]

            # Extract existence mask from target
            exists_mask = target_objects[:, :, 0] > 0.5  # [batch, max_objects]

            target_core = target_objects[:, :, :self.object_param_dim]

            # Sequence-level reconstruction with velocity-aware weighting
            object_loss = self._wave_reconstruction_loss(pred_objects, target_core, exists_mask)
            losses['wave_loss(superquadric)'] += object_loss

            # Soft count alignment using scale magnitude as presence proxy
            target_presence = target_objects[:, :, 0].float()
            pred_scale_norm = torch.linalg.norm(pred_objects[:, :, 3:6], dim=-1)
            presence_input = (pred_scale_norm - self.wave_presence_threshold) / max(self.wave_presence_temperature, 1e-6)
            pred_presence = torch.sigmoid(presence_input)
            pred_count = pred_presence.sum(dim=-1)
            target_count = target_presence.sum(dim=-1)
            count_mse = F.mse_loss(pred_count, target_count)
            losses['wave_count_mse'] += count_mse
            # Count penalty also folded into the wave loss (weighted separately).
            losses['wave_loss(superquadric)'] += self.wave_count_weight * count_mse

            pla_entries.append({
                'frame_idx': frame_idx,
                'pred_objects': pred_objects,
                'exists_mask': exists_mask,
            })

            # Aggregate summaries for contrastive objective
            mask = 
exists_mask.float().unsqueeze(-1)
            # Avoid division by zero by clamping the counts before inversion
            denom = mask.sum(dim=1).clamp_min(1.0)
            pred_summary = (pred_objects * mask).sum(dim=1) / denom
            target_summary = (target_core * mask).sum(dim=1) / denom
            pred_summaries.append(pred_summary)
            target_summaries.append(target_summary)

            # World loss
            target_world = targets['world'][:, frame_idx]  # [batch, 11]
            pred_world = pred['world']  # [batch, 8]

            # Compare only first 8 dimensions
            world_loss = self.world_loss_fn(
                pred_world,
                target_world[:, :8]
            )
            losses['world_info_loss(camera,scale,time)'] += world_loss

            # Physics loss (constant across frames, use frame 0)
            if i == 0:
                target_physics = targets['physics']  # [batch, max_objects, 3]
                pred_physics = pred['physics']  # [batch, max_objects, 3]

                # NOTE(review): if no object exists in this frame, pred_physics[exists_mask]
                # is empty and MSELoss over it yields NaN — confirm the data guarantees
                # at least one object per sample.
                physics_loss = self.physics_loss_fn(
                    pred_physics[exists_mask],
                    target_physics[exists_mask]
                )
                losses['controllable_info_loss(mass,friction,restitution)'] = physics_loss

        # Average over frames
        num_frames = len(predictions)
        losses['wave_loss(superquadric)'] /= num_frames
        losses['world_info_loss(camera,scale,time)'] /= num_frames
        losses['wave_count_mse'] /= num_frames

        # Anchor PLA loss around the observed middle frame to provide a reference state
        total_frames = targets['objects'].shape[1]
        middle_idx = total_frames // 2
        anchor_objects = targets['objects'][:, middle_idx]
        anchor_exists = anchor_objects[:, :, 0] > 0.5
        pla_entries.append({
            'frame_idx': middle_idx,
            'pred_objects': anchor_objects[:, :, :self.object_param_dim].detach(),
            'exists_mask': anchor_exists,
        })

        # Physical regularizer
        pla_loss = self._compute_pla_regularizer(pla_entries)
        losses['pla_loss'] = pla_loss

        # Contrastive alignment between predicted and target trajectories
        if pred_summaries:
            pred_stack = torch.stack(pred_summaries, dim=0).mean(dim=0)
            target_stack = torch.stack(target_summaries, dim=0).mean(dim=0)
            losses['wave_contrastive_loss'] = self._contrastive_clip_loss(pred_stack, target_stack)
        else:
            device = targets['objects'].device
            losses['wave_contrastive_loss'] = torch.zeros((), device=device)

        # Compute total loss
        for key, weight in self.loss_weights.items():
            if key in losses:
                losses['total'] += weight * losses[key]

        return losses

    def _wave_reconstruction_loss(
        self,
        pred_objects: torch.Tensor,
        target_objects: torch.Tensor,
        exists_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Velocity-aware reconstruction loss combining position L1 and velocity L1."""
        device = pred_objects.device
        dtype = pred_objects.dtype
        # No existing objects -> zero loss (keeps gradients finite).
        if not exists_mask.any():
            return torch.zeros((), device=device, dtype=dtype)

        pred_active = pred_objects[exists_mask]
        target_active = target_objects[exists_mask]

        base_l1 = F.l1_loss(pred_active, target_active, reduction='mean')

        if self.velocity_slice.start >= self.velocity_slice.stop:  # degenerate slice when dim < 3
            velocity_l1 = torch.zeros((), device=device, dtype=dtype)
        else:
            pred_velocity = pred_active[..., self.velocity_slice]
            target_velocity = target_active[..., self.velocity_slice]
            velocity_l1 = F.l1_loss(pred_velocity, target_velocity, reduction='mean')

        # Equal weighting between whole-vector L1 and velocity-only L1.
        return 0.5 * base_l1 + 0.5 * velocity_l1

    def _contrastive_clip_loss(
        self,
        pred_summary: torch.Tensor,
        target_summary: torch.Tensor,
    ) -> torch.Tensor:
        """InfoNCE-style contrastive loss between predicted and target clip summaries."""
        device = pred_summary.device
        dtype = pred_summary.dtype
        batch = pred_summary.size(0)
        # A single-sample batch has no negatives, so the objective is undefined.
        if batch <= 1:
            return torch.zeros((), device=device, dtype=dtype)

        # Trim both summaries to a shared feature width before comparison.
        dim = min(pred_summary.size(-1), target_summary.size(-1))
        if dim == 0:
            return torch.zeros((), device=device, dtype=dtype)
        if pred_summary.size(-1) != dim:
            pred_summary = pred_summary[..., :dim]
        if target_summary.size(-1) != dim:
            target_summary = target_summary[..., :dim]

        temperature = 
max(self.wave_contrastive_temperature, 1e-6)
        pred_norm = F.normalize(pred_summary, dim=-1)
        target_norm = F.normalize(target_summary, dim=-1)
        dim_post = min(pred_norm.size(-1), target_norm.size(-1))
        if dim_post == 0:
            return torch.zeros((), device=device, dtype=dtype)
        if pred_norm.size(-1) != dim_post:
            pred_norm = pred_norm[..., :dim_post]
        if target_norm.size(-1) != dim_post:
            target_norm = target_norm[..., :dim_post]
        # Cosine-similarity logits between every pred/target pair in the batch.
        logits = pred_norm @ target_norm.transpose(0, 1)
        logits = logits / temperature

        # Matching pairs lie on the diagonal; symmetric CE over both directions.
        labels = torch.arange(batch, device=device)
        loss_forward = F.cross_entropy(logits, labels)
        loss_backward = F.cross_entropy(logits.transpose(0, 1), labels)

        return 0.5 * (loss_forward + loss_backward)

    def _compute_pla_regularizer(self, entries: List[Dict[str, torch.Tensor]]) -> torch.Tensor:
        """Encourage rigid-body consistency, free-fall dynamics, and collision plausibility."""
        model_device = next(self.model.parameters()).device
        if not entries:
            return torch.tensor(0.0, device=model_device)

        # Sort by frame index to obtain temporal order
        sorted_entries = sorted(entries, key=lambda x: x['frame_idx'])

        device = sorted_entries[0]['pred_objects'].device
        dtype = sorted_entries[0]['pred_objects'].dtype

        preds = torch.stack([item['pred_objects'] for item in sorted_entries], dim=0)  # [F, B, O, 12]
        exists = torch.stack([item['exists_mask'].float() for item in sorted_entries], dim=0)  # [F, B, O]

        frame_count, batch_size, max_objects, _ = preds.shape

        # Temporal regularizers need at least two frames.
        if frame_count <= 1:
            return torch.tensor(0.0, device=device, dtype=dtype)

        exists_expanded = exists.unsqueeze(-1)
        exists_total = exists_expanded.sum()
        if exists_total.item() == 0:
            return torch.tensor(0.0, device=device, dtype=dtype)

        # 1. 
Shape and scale invariance for rigid bodies
        shape_params = preds[..., 1:3]
        scale_params = preds[..., 3:6]

        # Per-object temporal means, masked by existence.
        shape_mean = (shape_params * exists_expanded).sum(dim=0) / exists_expanded.sum(dim=0).clamp_min(1.0)
        scale_mean = (scale_params * exists_expanded).sum(dim=0) / exists_expanded.sum(dim=0).clamp_min(1.0)

        # Variance around the temporal mean: rigid bodies should not change shape/scale.
        shape_loss = ((shape_params - shape_mean) ** 2 * exists_expanded).sum() / exists_expanded.sum().clamp_min(1.0)
        scale_loss = ((scale_params - scale_mean) ** 2 * exists_expanded).sum() / exists_expanded.sum().clamp_min(1.0)

        # 2. Free-fall consistency via discrete Euler-Lagrange residuals
        freefall_loss = torch.tensor(0.0, device=device, dtype=dtype)
        rotation_loss = torch.tensor(0.0, device=device, dtype=dtype)
        collision_penalty = torch.tensor(0.0, device=device, dtype=dtype)
        velocity_loss = torch.tensor(0.0, device=device, dtype=dtype)

        positions = preds[..., 6:9]

        if frame_count >= 3:
            radii = torch.linalg.norm(preds[..., 3:6], dim=-1)

            # Second-order central difference approximating acceleration.
            accel = positions[2:] - 2 * positions[1:-1] + positions[:-2]

            # Valid only where the object exists in all three consecutive frames.
            exists_triplet = exists[1:-1] * exists[:-2] * exists[2:]
            exists_triplet_expanded = exists_triplet.unsqueeze(-1)

            # Collision detection to gate free-fall prior
            center_positions = positions[1:-1].reshape(-1, max_objects, 3)
            center_exists = exists[1:-1].reshape(-1, max_objects)
            center_radii = radii[1:-1].reshape(-1, max_objects)

            if center_positions.numel() > 0:
                dist = torch.cdist(center_positions, center_positions, p=2)  # [N, O, O]
                radius_sum = (center_radii.unsqueeze(-1) + center_radii.unsqueeze(-2)) * self.collision_buffer
                exists_pair = center_exists.unsqueeze(-1) * center_exists.unsqueeze(-2)

                # Mask out self-distances on the diagonal.
                eye = torch.eye(max_objects, device=device).unsqueeze(0)
                non_diag = (1 - eye)

                penetration = torch.relu((radius_sum - dist) * non_diag) * exists_pair
                collision_penalty = penetration.pow(2).sum() / (non_diag * exists_pair).sum().clamp_min(1.0)

                contact_any = (penetration > 0).any(dim=-1).view(frame_count - 2, 
batch_size, max_objects)
            else:
                contact_any = torch.zeros(frame_count - 2, batch_size, max_objects, device=device, dtype=torch.bool)

            contact_mask = contact_any.float()

            # Free-falling objects should satisfy accel ≈ -g on the z axis.
            gravity_vec = torch.tensor([0.0, 0.0, -self.gravity], device=device, dtype=dtype).view(1, 1, 1, 3)
            residual = accel + gravity_vec

            # Only apply the free-fall prior where no contact was detected.
            freefall_mask = exists_triplet_expanded * (1.0 - contact_mask.unsqueeze(-1))
            valid_count = freefall_mask.sum().clamp_min(1.0)
            freefall_loss = (residual.pow(2) * freefall_mask).sum() / valid_count

            # Encode rotations as sin/cos to avoid wrap-around discontinuities.
            rotations = preds[..., 9:12]
            rot_sin = torch.sin(rotations)
            rot_cos = torch.cos(rotations)
            rot_features = torch.cat([rot_sin, rot_cos], dim=-1)
            rot_acc = rot_features[2:] - 2 * rot_features[1:-1] + rot_features[:-2]

            rot_mask = exists_triplet_expanded * (1.0 - contact_mask.unsqueeze(-1))
            rot_valid = rot_mask.sum().clamp_min(1.0)
            rotation_loss = (rot_acc.pow(2) * rot_mask).sum() / rot_valid

        if frame_count >= 2:
            # NOTE(review): slots 12:15 are empty when object_param_dim == 12 (the
            # trainer's getattr default), which would make this slice zero-width and
            # break broadcasting against the 3-channel `diff` below — confirm the
            # model actually emits >= 15 params per object.
            velocities = preds[..., 12:15]
            # Finite-difference velocities, scaled to units-per-second.
            diff = (positions[1:] - positions[:-1]) * self.frame_rate
            exists_pair = exists[1:] * exists[:-1]
            diff_expanded = exists_pair.unsqueeze(-1)

            velocity_residual = (velocities[1:] - diff).pow(2) * diff_expanded
            valid_velocity = diff_expanded.sum()
            velocity_loss = velocity_residual.sum()

            # The first frame's velocity is checked against the first difference.
            first_pair = (exists[0] * exists[1]).unsqueeze(-1)
            velocity_loss += ((velocities[0] - diff[0]) ** 2 * first_pair).sum()
            valid_velocity += first_pair.sum()

            velocity_loss = velocity_loss / valid_velocity.clamp_min(1.0)

        pla_loss = (
            shape_loss
            + scale_loss
            + freefall_loss
            + rotation_loss
            + collision_penalty
            + velocity_loss
        )
        return pla_loss

    def _select_anchor_frame(self, num_frames: int) -> int:
        """Determine which frame should serve as the initial anchor."""
        cfg = self.config['training'].get('initial_frame', {})
        strategy = cfg.get('strategy', 'middle')

        if strategy == 'random':
            base_idx = int(torch.randint(low=0, high=num_frames, size=(1,), 
device=torch.device('cpu')).item())
        elif strategy == 'fixed':
            base_idx = int(cfg.get('index', num_frames // 2))
        else:
            # Default strategy: anchor on the middle frame.
            base_idx = num_frames // 2

        offset = int(cfg.get('offset', 0))
        anchor_idx = base_idx + offset
        # Clamp into the valid frame range.
        anchor_idx = max(0, min(num_frames - 1, anchor_idx))
        return anchor_idx

    def _generate_full_sequence(
        self,
        text: List[str],
        objects: torch.Tensor,
        world: torch.Tensor,
        physics: torch.Tensor,
        teacher_prob: float,
        anchor_idx: Optional[int] = None,
        use_noise: bool = False,
    ) -> Tuple[List[Dict[str, torch.Tensor]], List[int], float]:
        """Generate a full sequence of predictions given an anchor frame."""
        batch_size, num_frames = objects.shape[:2]
        if anchor_idx is None:
            anchor_idx = self._select_anchor_frame(num_frames)

        static_object_params = None
        if self.freeze_static_params:
            # First 6 slots (exists + shape + scale) are taken from the anchor GT.
            anchor_static = objects[:, anchor_idx, :, :6]
            static_object_params = anchor_static

        # Per-sample teacher forcing: 1.0 means GT anchor is fed as history.
        if teacher_prob > 0.0:
            teacher_mask = (torch.rand(batch_size, device=objects.device) < teacher_prob).float()
        else:
            teacher_mask = torch.zeros(batch_size, device=objects.device, dtype=torch.float32)

        def sample_noise():
            # Fresh noise per model call when multi-sample exploration is enabled.
            return self.base_model.sample_decoder_noise(batch_size, objects.device) if use_noise else None

        half_span = max(num_frames - 1, 1) / 2.0
        inference_time = 0.0
        predictions_by_idx: Dict[int, Dict[str, torch.Tensor]] = {}

        anchor_rel_times = torch.zeros(
            (batch_size, 1), dtype=torch.float32, device=objects.device
        )
        anchor_targets = torch.full(
            (batch_size, 1), anchor_idx, dtype=torch.long, device=objects.device
        )

        start = time.time()
        anchor_preds = self.model(
            input_text=text,
            target_frames=anchor_targets,
            history_frames=None,
            relative_times=anchor_rel_times,
            static_object_params=static_object_params,
            noise=sample_noise(),
        )
        inference_time += time.time() - start
        anchor_pred = anchor_preds[0]
        predictions_by_idx[anchor_idx] = anchor_pred

        anchor_gt_objects = objects[:, anchor_idx, :, 
:self.object_param_dim]
        if anchor_gt_objects.shape[-1] < self.object_param_dim:
            pad_width = self.object_param_dim - anchor_gt_objects.shape[-1]
            pad = anchor_gt_objects.new_zeros(*anchor_gt_objects.shape[:-1], pad_width)
            anchor_gt_objects = torch.cat([anchor_gt_objects, pad], dim=-1)
        anchor_gt_world = world[:, anchor_idx, :8]
        anchor_pred_objects = anchor_pred['objects']
        if static_object_params is not None:
            # NOTE(review): forward() already applies static params (via clone);
            # this extra in-place write mutates a tensor that is part of the
            # autograd graph — confirm it is safe/needed.
            anchor_pred_objects[:, :, :6] = static_object_params[:, :, :6]
        anchor_pred_world = anchor_pred['world']

        teacher_mask_objs = teacher_mask.view(batch_size, 1, 1)
        teacher_mask_world = teacher_mask.view(batch_size, 1)

        # Blend predicted and ground-truth anchor according to the teacher mask.
        blended_objects = anchor_pred_objects * (1.0 - teacher_mask_objs) + anchor_gt_objects * teacher_mask_objs
        blended_world = anchor_pred_world * (1.0 - teacher_mask_world) + anchor_gt_world * teacher_mask_world

        history_objects = blended_objects.unsqueeze(1)
        history_world = blended_world.unsqueeze(1)
        history_physics = physics.clone()

        def make_history_seed():
            # Fresh copies so backward and forward rollouts don't share state.
            return {
                'objects': history_objects.clone(),
                'world': history_world.clone(),
                'physics': history_physics.clone(),
            }

        backward_indices = list(range(anchor_idx - 1, -1, -1))
        forward_indices = list(range(anchor_idx + 1, num_frames))

        def run_direction(target_indices: List[int]):
            nonlocal inference_time
            if not target_indices:
                return

            # Relative times normalised to [-1, 1] around the anchor.
            rel_times = torch.tensor(
                [(idx - anchor_idx) / half_span for idx in target_indices],
                dtype=torch.float32,
                device=objects.device,
            ).unsqueeze(0).repeat(batch_size, 1)

            target_tensor = torch.tensor(
                target_indices,
                dtype=torch.long,
                device=objects.device,
            ).unsqueeze(0).repeat(batch_size, 1)

            history_frames = make_history_seed()

            start_time = time.time()
            preds = self.model(
                input_text=text,
                target_frames=target_tensor,
                history_frames=history_frames,
                relative_times=rel_times,
                static_object_params=static_object_params,
                noise=sample_noise(),
            )
            inference_time += time.time() 
- start_time

            for idx, pred in zip(target_indices, preds):
                if static_object_params is not None:
                    pred['objects'][:, :, :6] = static_object_params[:, :, :6]
                predictions_by_idx[idx] = pred

        # Roll out from the anchor toward both ends of the clip.
        run_direction(backward_indices)
        run_direction(forward_indices)

        ordered_indices = list(range(num_frames))
        predictions = [predictions_by_idx[idx] for idx in ordered_indices]
        return predictions, ordered_indices, inference_time

    def _compute_losses(
        self,
        batch: Dict[str, torch.Tensor],
    ) -> Tuple[Dict[str, torch.Tensor], float, int]:
        """Shared logic for computing losses and metadata."""
        text = batch['text']
        objects = batch['objects']  # [batch, num_frames, max_objects, 16]
        world = batch['world']  # [batch, num_frames, 11]
        physics = batch['physics']  # [batch, max_objects, 3]

        batch_size, num_frames = objects.shape[:2]
        anchor_idx = self._select_anchor_frame(num_frames)
        teacher_prob = float(self.config['training'].get('initial_teacher_forcing_prob', 0.5))

        targets = {
            'objects': objects,
            'world': world,
            'physics': physics,
        }

        # Multi-sample exploration only during training; eval is deterministic.
        attempts = self.sample_attempts if self.model.training else 1
        use_noise = attempts > 1
        best_losses: Optional[Dict[str, torch.Tensor]] = None
        best_predictions: Optional[List[Dict[str, torch.Tensor]]] = None
        best_frame_indices: Optional[List[int]] = None
        best_inference_time: float = 0.0
        best_total_value: Optional[float] = None

        for attempt in range(attempts):
            predictions, frame_indices, inference_time = self._generate_full_sequence(
                text=text,
                objects=objects,
                world=world,
                physics=physics,
                teacher_prob=teacher_prob,
                anchor_idx=anchor_idx,
                use_noise=use_noise,
            )

            losses = self.compute_loss(predictions, targets, frame_indices)
            total_value = float(losses['total'].detach())
            # Keep the lowest-loss attempt; drop the rest to free memory.
            if best_total_value is None or total_value < best_total_value:
                if best_losses is not None:
                    del best_losses
                if best_predictions is not None:
                    del best_predictions
                best_total_value = total_value
                best_losses = losses
                best_predictions = predictions
                best_frame_indices = frame_indices
                best_inference_time = inference_time
            else:
                del losses
                del predictions
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        assert best_losses is not None and best_predictions is not None and best_frame_indices is not None
        num_predicted_frames = len(best_predictions)
        frames_per_second = num_predicted_frames / best_inference_time if best_inference_time > 0 else 0.0

        return best_losses, frames_per_second, num_predicted_frames

    def train_step(
        self,
        batch: Dict[str, torch.Tensor],
        step: int,
    ) -> Dict[str, float]:
        """Single training step with bidirectional prediction"""
        self.model.train()

        losses, frames_per_second, num_predicted_frames = self._compute_losses(batch)

        # Backward only; the optimizer step is handled by the caller.
        self.accelerator.backward(losses['total'])

        loss_dict = {k: v.item() if torch.is_tensor(v) else float(v) for k, v in losses.items()}
        loss_dict['inference_fps'] = frames_per_second
        loss_dict['frames_predicted'] = num_predicted_frames

        return loss_dict

    def evaluate_batch(self, batch: Dict[str, torch.Tensor]) -> Dict[str, float]:
        """Compute losses without gradient updates."""
        was_training = self.model.training
        self.model.eval()
        with torch.no_grad():
            losses, frames_per_second, num_predicted_frames = self._compute_losses(batch)
        # Restore the caller's training mode.
        if was_training:
            self.model.train()

        loss_dict = {k: v.item() if torch.is_tensor(v) else float(v) for k, v in losses.items()}
        loss_dict['inference_fps'] = frames_per_second
        loss_dict['frames_predicted'] = num_predicted_frames
        return loss_dict




def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_config', type=str, default='configs/default.yaml',
                        help='Training configuration file')
    parser.add_argument('--data_root', type=str,
                        default='../data/movi_a_128x128',
                        help='Root directory of MOVi dataset')
    parser.add_argument('--output_dir', type=str, default='core_space',
help='Directory to save checkpoints and generation results') + parser.add_argument('--resume_step', type=int, default=None, + help='Resume training from specific step') + args = parser.parse_args() + + # Load training config + with open(args.train_config, 'r') as f: + config = yaml.safe_load(f) + + # Initialize accelerator with DDP configuration + from accelerate import DistributedDataParallelKwargs + + ddp_kwargs = DistributedDataParallelKwargs( + find_unused_parameters=True, + broadcast_buffers=False + ) + + # 注意:混合精度通过launch_text2wave_training.sh中的--mixed_precision参数控制 + # 如果遇到NaN问题,请确保shell脚本中没有启用mixed_precision + accelerator = Accelerator( + gradient_accumulation_steps=1, + kwargs_handlers=[ddp_kwargs] + ) + + # Set seed + set_seed(42) + + # Create model + model_name = config.get('text2wave_model', {}).get('model_name', "google/t5-v1_1-small") + model = Text2WaveModel( + model_name=model_name, + max_objects=10, + num_frames=24, + max_history_frames=config['training']['max_history_frames'], + random_history_sampling=config['training'].get('random_history_sampling', True), + decoder_noise_std=config['training'].get('decoder_noise_std', 0.0), + ) + + # Create optimizer + optimizer = torch.optim.AdamW( + model.parameters(), + lr=config['training']['learning_rate'], + weight_decay=0.01, + ) + + # Create dataloaders + train_dataloader = create_dataloader( + data_root=args.data_root, + split='train', + batch_size=config['training']['batch_size'], + num_workers=config['data']['num_workers'], + shuffle=True, + max_samples=config['data'].get('max_sequences', -1), + ) + + val_dataloader = create_dataloader( + data_root=args.data_root, + split='validation', + batch_size=config['training']['batch_size'], + num_workers=config['data']['num_workers'], + shuffle=False, + max_samples=10, # Use only 10 validation samples + ) + + # Prepare for distributed training + model, optimizer, train_dataloader, val_dataloader = accelerator.prepare( + model, optimizer, train_dataloader, 
val_dataloader + ) + + checkpoint_dir = Path("checkpoints_text2wave") + if accelerator.is_main_process: + checkpoint_dir.mkdir(parents=True, exist_ok=True) + + log_file_path = checkpoint_dir / "training_log.txt" + + def log_message(message: str): + """Log to stdout and append to training_log.txt from main process.""" + if not accelerator.is_main_process: + return + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + formatted = f"{timestamp} {message}" + print(formatted) + try: + with open(log_file_path, 'a') as fp: + fp.write(formatted + "\n") + except Exception: + pass + + best_metrics_path = checkpoint_dir / "best_metrics.json" + if best_metrics_path.exists(): + try: + best_metrics_path.unlink() + except OSError as exc: + log_message(f"Warning: failed to remove legacy best_metrics.json due to {exc}") + + best_train_loss = float('inf') + best_val_loss = float('inf') + + evaluation_cfg = config['training'].get('evaluation', {}) + eval_max_batches = evaluation_cfg.get('max_batches', 5) + + training_stats_path = checkpoint_dir / "training_stats.npz" + loaded_step_history: Optional[List[int]] = None + loaded_loss_history: Dict[str, List[float]] = {} + if training_stats_path.exists(): + try: + stats = np.load(training_stats_path, allow_pickle=True) + best_train_loss = float(stats.get('best_train_loss', best_train_loss)) + best_val_loss = float(stats.get('best_val_loss', best_val_loss)) + if 'step_history' in stats: + loaded_step_history = stats['step_history'].tolist() + if 'loss_history_keys' in stats and 'loss_history_values' in stats: + keys = stats['loss_history_keys'].tolist() + values = stats['loss_history_values'].tolist() + for key, value in zip(keys, values): + loaded_loss_history[str(key)] = list(np.asarray(value, dtype=float)) + except Exception as exc: + log_message(f"Warning: failed to load training_stats.npz due to {exc}") + + executor = ThreadPoolExecutor(max_workers=1) + pending_futures: List = [] + + def cleanup_futures(): + pending_futures[:] 
= [f for f in pending_futures if not f.done()] + + def submit_task(fn, *args, **kwargs): + cleanup_futures() + future = executor.submit(fn, *args, **kwargs) + pending_futures.append(future) + return future + + def recursive_to_cpu(obj): + if isinstance(obj, torch.Tensor): + return obj.detach().cpu() + if isinstance(obj, dict): + return {k: recursive_to_cpu(v) for k, v in obj.items()} + if isinstance(obj, list): + return [recursive_to_cpu(v) for v in obj] + if isinstance(obj, tuple): + return tuple(recursive_to_cpu(v) for v in obj) + return obj + + def save_checkpoint_async(path: Path, payload: Dict): + def _task(): + torch.save(payload, path) + submit_task(_task) + + def save_generation_async(predictions: List[Dict], targets: Dict[str, torch.Tensor], texts: List[str], step: int, save_config: Dict, metadata: Dict, batch_data: Dict, data_root: str, data_split: str): + def _task(): + save_generation_results( + predictions=predictions, + targets=targets, + texts=texts, + step=step, + output_dir=args.output_dir, + save_config=save_config, + metadata=metadata, + batch_data=batch_data, + data_root=data_root, + data_split=data_split + ) + submit_task(_task) + + def compute_validation_loss(max_batches: Optional[int]) -> Optional[float]: + limit = -1 if max_batches is None else max_batches + if limit == 0: + return None + total = 0.0 + count = 0 + for batch_idx, val_batch in enumerate(val_dataloader): + val_losses = trainer.evaluate_batch(val_batch) + total += val_losses['total'] + count += 1 + if limit > 0 and (batch_idx + 1) >= limit: + break + if count == 0: + return None + return total / count + + # Create trainer + trainer = BidirectionalTrainer(model, config, accelerator) + + # Get max_steps from config + max_steps = config['training']['max_steps'] + + # Calculate and display dataset traversal information + if accelerator.is_main_process: + steps_per_epoch = len(train_dataloader) + total_epochs = max_steps / steps_per_epoch + log_message("=" * 60) + 
log_message("Dataset Information:") + log_message(f"- Training samples: {len(train_dataloader.dataset) if hasattr(train_dataloader, 'dataset') else 'N/A'}") + log_message(f"- Batch size: {config['training']['batch_size']}") + log_message(f"- Steps per epoch (full dataset): {steps_per_epoch}") + log_message(f"- Total training steps: {max_steps}") + log_message(f"- Will traverse dataset: {total_epochs:.2f} times") + log_message("=" * 60) + + # Resume from checkpoint if specified + start_step = 0 + resumed_from = None + if args.resume_step is not None: + checkpoint_path = checkpoint_dir / f"step{args.resume_step}.pt" + if checkpoint_path.exists(): + log_message(f"Resuming from checkpoint step {args.resume_step}") + checkpoint = torch.load(checkpoint_path, map_location='cpu') + accelerator.unwrap_model(model).load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_step = checkpoint.get('step', args.resume_step) + resumed_from = checkpoint_path + else: + log_message(f"Warning: Checkpoint for step {args.resume_step} not found, starting from scratch") + else: + latest_checkpoint_path = checkpoint_dir / "latest.pt" + if latest_checkpoint_path.exists(): + try: + log_message("Resuming from latest checkpoint") + checkpoint = torch.load(latest_checkpoint_path, map_location='cpu') + accelerator.unwrap_model(model).load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_step = checkpoint.get('step', 0) + resumed_from = latest_checkpoint_path + except Exception as exc: + log_message(f"Warning: failed to load latest checkpoint due to {exc}; attempting best checkpoint") + try: + corrupt_path = latest_checkpoint_path.with_suffix(latest_checkpoint_path.suffix + ".corrupt") + latest_checkpoint_path.rename(corrupt_path) + log_message(f"Renamed corrupt latest checkpoint to {corrupt_path.name}") + except Exception as rename_exc: + log_message(f"Warning: could 
not rename corrupt latest checkpoint: {rename_exc}") + if resumed_from is None: + best_checkpoint_path = checkpoint_dir / "best.pt" + if best_checkpoint_path.exists(): + try: + log_message("Resuming from best checkpoint") + checkpoint = torch.load(best_checkpoint_path, map_location='cpu') + accelerator.unwrap_model(model).load_state_dict(checkpoint['model_state_dict']) + optimizer.load_state_dict(checkpoint['optimizer_state_dict']) + start_step = checkpoint.get('step', 0) + resumed_from = best_checkpoint_path + except Exception as exc: + log_message(f"Warning: failed to load best checkpoint due to {exc}; starting from scratch") + + # Setup local logging and plotting + log_dir = checkpoint_dir + loss_history = defaultdict(list) + step_history: List[int] = [] + if loaded_step_history: + step_history.extend(int(s) for s in loaded_step_history) + if loaded_loss_history: + for key, values in loaded_loss_history.items(): + loss_history[key].extend(values) + last_plot_time = time.time() + plot_path = log_dir / "losses.png" + + def save_training_stats(): + if not accelerator.is_main_process: + return + keys = sorted(loss_history.keys()) + loss_arrays = [np.array(loss_history[k], dtype=np.float32) for k in keys] + np.savez( + training_stats_path, + best_train_loss=best_train_loss, + best_val_loss=best_val_loss, + step_history=np.array(step_history, dtype=np.int64), + loss_history_keys=np.array(keys, dtype=object), + loss_history_values=np.array(loss_arrays, dtype=object), + ) + + def update_loss_plot(): + if not accelerator.is_main_process or not step_history: + return + x_values = np.array(step_history, dtype=np.int64) + keys = [k for k, v in sorted(loss_history.items()) if v] + if not keys: + return + + def align_series(series: List[float]) -> np.ndarray: + y_vals = np.array(series, dtype=np.float32) + if len(y_vals) > len(x_values): + y_vals = y_vals[-len(x_values):] + elif len(y_vals) < len(x_values): + pad = np.full(len(x_values) - len(y_vals), np.nan, 
dtype=np.float32) + y_vals = np.concatenate([pad, y_vals]) + return y_vals + + fig_height = 3 * (len(keys) + 1) + fig, axes = plt.subplots(len(keys) + 1, 1, figsize=(10, fig_height), sharex=True) + if not isinstance(axes, np.ndarray): + axes = np.array([axes]) + + cmap = plt.get_cmap('tab10', len(keys)) + + aggregated_ax = axes[0] + aggregated_ax.set_title("Training Losses (all)") + aggregated_ax.set_ylabel("Loss") + aggregated_ax.grid(True, alpha=0.3) + + for idx, key in enumerate(keys): + y_aligned = align_series(loss_history[key]) + if np.all(np.isnan(y_aligned)): + continue + color = cmap(idx % cmap.N) + aggregated_ax.plot(x_values, y_aligned, label=key, color=color) + ax = axes[idx + 1] + ax.plot(x_values, y_aligned, color=color) + ax.set_ylabel(key) + ax.grid(True, alpha=0.3) + + axes[-1].set_xlabel("Step") + aggregated_ax.legend() + fig.tight_layout() + fig.savefig(plot_path) + plt.close(fig) + save_training_stats() + + if accelerator.is_main_process and step_history: + update_loss_plot() + + # Training loop + global_step = start_step + + with tqdm(total=max_steps, initial=start_step, disable=not accelerator.is_local_main_process, position=0, leave=True) as pbar: + while global_step < max_steps: + for batch in train_dataloader: + # Training step + losses = trainer.train_step(batch, global_step) + + # Update progress + if accelerator.is_local_main_process: + pbar.update(1) + # Add fps info to losses for display + display_losses = losses.copy() + display_losses['fps'] = losses['inference_fps'] + pbar.set_postfix(display_losses) + + # Print step info to create a log history + loss_str = f"Step {global_step}: " + for k, v in losses.items(): + if k not in ['inference_fps', 'frames_predicted']: + loss_str += f"{k}={v:.4f} " + loss_str += f"| {losses['frames_predicted']} frames @ {losses['inference_fps']:.1f} fps (training speed, inference faster)" + tqdm.write(loss_str) + + + if accelerator.is_main_process: + step_history.append(global_step) + for k, v in 
losses.items(): + if k in ['inference_fps', 'frames_predicted']: + continue + loss_history[k].append(v) + current_time = time.time() + if current_time - last_plot_time >= 10: + update_loss_plot() + last_plot_time = current_time + + # Save checkpoint and generation results + # Save at step 5 for testing, then at regular intervals + save_condition = (global_step == 5) or (global_step > 0 and global_step % config['training']['save_generation']['save_interval'] == 0) + if save_condition: + if accelerator.is_main_process: + generation_save_dir = Path(args.output_dir) + generation_save_dir.mkdir(parents=True, exist_ok=True) + + current_train_loss = losses['total'] + val_loss = compute_validation_loss(eval_max_batches) + + model_state = recursive_to_cpu(accelerator.get_state_dict(model)) + optimizer_state = recursive_to_cpu(optimizer.state_dict()) + payload = { + 'step': global_step, + 'model_state_dict': model_state, + 'optimizer_state_dict': optimizer_state, + 'config': config, + } + + latest_checkpoint_path = checkpoint_dir / "latest.pt" + save_checkpoint_async(latest_checkpoint_path, dict(payload)) + save_training_stats() + + is_new_best = False + if val_loss is not None: + if val_loss < best_val_loss: + best_val_loss = val_loss + best_train_loss = min(best_train_loss, current_train_loss) + is_new_best = True + else: + if current_train_loss < best_train_loss: + best_train_loss = current_train_loss + is_new_best = True + + if is_new_best: + best_checkpoint_path = checkpoint_dir / "best.pt" + save_checkpoint_async(best_checkpoint_path, dict(payload)) + save_training_stats() + if val_loss is not None: + log_message(f"New best checkpoint at step {global_step}: train_loss={current_train_loss:.6f}, val_loss={val_loss:.6f}") + else: + log_message(f"New best checkpoint at step {global_step}: train_loss={current_train_loss:.6f}") + + if config['training']['save_generation']['enabled']: + with torch.no_grad(): + val_batch = next(iter(val_dataloader)) + texts = 
val_batch['text'][:5] + val_objects = val_batch['objects'][:5] + val_world = val_batch['world'][:5] + val_physics = val_batch.get('physics') + if val_physics is not None: + val_physics = val_physics[:5] + else: + val_physics = torch.zeros_like(val_objects[:, 0, :, :3]) + val_device = val_objects.device + val_batch_size, val_num_frames = val_objects.shape[:2] + anchor_idx = trainer._select_anchor_frame(val_num_frames) + predictions, generated_indices, _ = trainer._generate_full_sequence( + text=texts, + objects=val_objects, + world=val_world, + physics=val_physics, + teacher_prob=0.0, + anchor_idx=anchor_idx, + ) + + val_objects_cpu = val_objects.detach().cpu() + val_world_cpu = val_world.detach().cpu() + val_physics_cpu = val_physics.detach().cpu() + val_batch_cpu = recursive_to_cpu(val_batch) + predictions_cpu = [{ + 'objects': pred['objects'].detach().cpu(), + 'world': pred['world'].detach().cpu(), + 'physics': pred['physics'].detach().cpu(), + } for pred in predictions] + targets_cpu = { + 'objects': val_objects_cpu, + 'world': val_world_cpu, + 'physics': val_physics_cpu, + } + metadata = { + 'sequence_names': val_batch.get('sequence_names', None)[:5] if 'sequence_names' in val_batch else None, + 'generated_indices': generated_indices, + } + save_generation_async( + predictions=predictions_cpu, + targets=targets_cpu, + texts=list(texts), + step=global_step, + save_config=config['training']['save_generation'], + metadata=metadata, + batch_data=val_batch_cpu, + data_root=args.data_root, + data_split='validation' + ) + else: + msg = f"No improvement at step {global_step}: train_loss={current_train_loss:.6f}" + if val_loss is not None: + msg += f", val_loss={val_loss:.6f}" + log_message(msg) + + # Gradient clipping before optimizer step + if accelerator.sync_gradients: + clip_val = config['training'].get('gradient_clip_val', 1.0) + accelerator.clip_grad_norm_(model.parameters(), max_norm=clip_val) + + optimizer.step() + optimizer.zero_grad() + + global_step += 1 + + 
if global_step >= max_steps: + break + + # Ensure latest plot is written + if accelerator.is_main_process: + update_loss_plot() + + # Ensure asynchronous tasks complete before final save + executor.shutdown(wait=True) + + # Final save + if accelerator.is_main_process: + checkpoint_dir.mkdir(parents=True, exist_ok=True) + final_checkpoint_path = checkpoint_dir / f"step{global_step}_final.pt" + + torch.save({ + 'step': global_step, + 'model_state_dict': model.state_dict(), + 'optimizer_state_dict': optimizer.state_dict(), + 'config': config, + }, final_checkpoint_path) + + # Update best.pt to point to final checkpoint + best_path = checkpoint_dir / "best.pt" + if best_path.exists() or best_path.is_symlink(): + best_path.unlink() + best_path.symlink_to(final_checkpoint_path.name) + + log_message(f"Saved final checkpoint: {final_checkpoint_path}") + + + +if __name__ == "__main__": + main() diff --git a/nano_WaveGen/utils/QUICK_START.txt b/nano_WaveGen/utils/QUICK_START.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6aa1efe54af4210a4a68f685fb9711bad4a799d --- /dev/null +++ b/nano_WaveGen/utils/QUICK_START.txt @@ -0,0 +1,76 @@ +═══════════════════════════════════════════════════════════════ + WaveGen 训练可视化工具 - 快速开始 +═══════════════════════════════════════════════════════════════ + +📦 安装依赖 +─────────────────────────────────────────────────────────────── +pip install viser numpy scipy opencv-python pillow + +🚀 启动方式 +─────────────────────────────────────────────────────────────── +方法1 (推荐): 使用启动脚本 + cd code/WaveGen/nano_WaveGen + ./visualize.sh + +方法2: 直接运行Python脚本 + cd code/WaveGen/nano_WaveGen + python utils/visualize_training.py + +方法3: 指定core_space路径 + python utils/visualize_training.py --core-space ../WaveGen_Augustus_v1/core_space + +🌐 Web界面 +─────────────────────────────────────────────────────────────── +启动后在浏览器打开: http://localhost:8080 + +🎮 基本操作 +─────────────────────────────────────────────────────────────── +1. 在"训练输出"面板选择训练输出和样本 +2. 
点击"加载样本"按钮 +3. 使用"帧控制"滑块切换帧 +4. 点击"▶ 播放"查看动画 + +🎨 可视化元素 +─────────────────────────────────────────────────────────────── +• 蓝色 mesh = 模型生成的超二次曲面 +• 红色 mesh = Ground Truth 超二次曲面 +• 点云 = 原始深度图重建的点云 +• 坐标系 = 世界坐标系参考 + +⚡ 快捷操作 +─────────────────────────────────────────────────────────────── +• 鼠标左键拖拽 旋转视角 +• 鼠标右键拖拽 平移视角 +• 滚轮 缩放视角 +• "重置视角" 恢复默认视角 +• "匹配GT相机" 对齐到Ground Truth相机 + +📊 数据要求 +─────────────────────────────────────────────────────────────── +core_space/ +└── YYYYMMDD_HHMMSS_stepN_text2wave/ + └── sample_X/ + ├── predictions.npz (必需) + ├── targets.npz (必需) + └── original_data/ (可选,用于点云) + +🐛 常见问题 +─────────────────────────────────────────────────────────────── +Q: 找不到训练输出? +A: 确保在正确的目录运行脚本,或使用--core-space指定路径 + +Q: 端口被占用? +A: 脚本会自动检测并使用下一个可用端口(8080→8081→8082...) + 也可手动指定: python utils/visualize_training.py --port 8888 + +Q: 不想自动打开浏览器? +A: python utils/visualize_training.py --no-browser + +Q: 点云不显示? +A: 确保original_data/depth/目录存在深度文件 + +📖 详细文档 +─────────────────────────────────────────────────────────────── +查看 utils/README_visualize.md 获取完整文档 + +═══════════════════════════════════════════════════════════════ diff --git a/nano_WaveGen/utils/README_visualize.md b/nano_WaveGen/utils/README_visualize.md new file mode 100644 index 0000000000000000000000000000000000000000..2431eebd9d6ad1ab286b14f1bccbb7691fd7da76 --- /dev/null +++ b/nano_WaveGen/utils/README_visualize.md @@ -0,0 +1,223 @@ +# WaveGen 训练可视化工具 + +独立的训练结果可视化工具,自动检索 `core_space` 目录并通过 Viser 进行 3D 可视化。 + +## 🚀 快速开始 + +### 1. 基本使用 + +在 `nano_WaveGen` 目录下运行: + +```bash +cd code/WaveGen/nano_WaveGen +python utils/visualize_training.py +``` + +### 2. 指定 core_space 目录 + +```bash +python utils/visualize_training.py --core-space ../WaveGen_Augustus_v1/core_space +``` + +### 3. 指定端口 + +```bash +python utils/visualize_training.py --port 8888 +``` + +### 4. 禁用自动打开浏览器 + +```bash +python utils/visualize_training.py --no-browser +``` + +## 📋 功能特性 + +### ✅ 自动端口分配 +- 默认尝试使用 8080 端口 +- 如果端口被占用,自动尝试下一个端口 (8081, 8082, ...) 
+- 最多尝试 10 个端口 +- 自动在浏览器中打开正确的URL + +### ✅ 自动扫描训练输出 +- 自动检测 `core_space/` 下的所有训练输出目录 +- 格式: `YYYYMMDD_HHMMSS_stepN_text2wave` +- 统计每个输出的样本数量 + +### 🎨 3D 可视化 +- **超二次曲面渲染**: 生成的预测结果(蓝色)和 Ground Truth(红色) +- **点云显示**: 从深度图重建的原始点云 +- **相机可视化**: GT 相机位置和朝向 +- **坐标系显示**: 世界坐标系参考 + +### 🎮 交互控制 +- **训练输出选择**: 下拉菜单选择不同的训练输出 +- **样本选择**: 滑块选择样本索引 +- **帧控制**: 滑块切换帧,播放/暂停动画 +- **颜色和透明度**: 实时调整渲染参数 +- **网格分辨率**: 调整超二次曲面的网格质量 + +### ⚡ 性能优化 +- **对象池机制**: 复用 mesh 对象,减少内存分配 +- **惰性加载**: 按需加载帧数据 +- **缓存策略**: 预加载相邻帧数据 + +## 🎛️ GUI 控制说明 + +### 训练输出面板 +- **选择训练输出**: 选择要查看的训练输出 +- **样本索引**: 选择样本(0 到 N-1) +- **加载样本**: 点击加载选中的样本 + +### 帧控制面板 +- **当前帧**: 滑块切换帧(0-23) +- **▶ 播放**: 开始自动播放动画 +- **⏸ 暂停**: 暂停动画播放 +- **播放FPS**: 设置播放帧率(1-30) + +### 生成结果面板 +- **显示生成的超二次曲面**: 开关预测结果显示 +- **生成结果透明度**: 调整蓝色 mesh 的透明度 +- **生成结果颜色**: 自定义预测结果颜色(默认蓝色) + +### Ground Truth 面板 +- **显示GT超二次曲面**: 开关 GT 显示 +- **GT透明度**: 调整红色 mesh 的透明度 +- **GT颜色**: 自定义 GT 颜色(默认红色) + +### 点云显示面板 +- **显示点云**: 开关点云显示 +- **点大小**: 调整点云渲染大小(0.001-0.02) + +### 渲染设置面板 +- **网格分辨率**: 调整超二次曲面的网格精度(10-50) +- **显示坐标系**: 开关世界坐标系显示 + +### 相机控制面板 +- **重置视角**: 恢复默认观察视角 +- **匹配GT相机**: 将视角设置为 GT 相机视角 + +## 📦 数据格式支持 + +### 支持的文件结构 + +``` +core_space/ +├── 20251204_212328_step5_text2wave/ +│ ├── sample_0/ +│ │ ├── predictions.npz # 模型预测结果 +│ │ ├── targets.npz # Ground Truth +│ │ ├── info.txt # 样本信息 +│ │ └── original_data/ # 原始数据 +│ │ ├── Full_Sample_Data_for_Learning_Target.npz +│ │ ├── depth/ +│ │ │ └── depth_merge.npz # 或 frame_*.npy +│ │ ├── rgb/ +│ │ │ └── frame_*.png +│ │ ├── segmentation/ +│ │ ├── metadata.json +│ │ └── camera_trajectory.npz +│ ├── sample_1/ +│ └── ... +└── 20251128_124329_step5_text2wave/ + └── ... +``` + +### predictions.npz 格式 + +```python +frames = [ + { + 'objects': [max_objects, 15], # 每个对象15个参数 + 'world': [8], # 世界参数 + 'physics': [max_objects, 3] # 物理属性(可选) + }, + ... 
# 24帧 +] + +# objects 参数 [15]: +# [0]: exists 存在标志 (0/1) +# [1:3]: shape ε1, ε2 +# [3:6]: scale a, b, c +# [6:9]: translation x, y, z +# [9:12]: rotation rx, ry, rz (欧拉角) +# [12:15]: velocity vx, vy, vz +``` + +### targets.npz 格式 + +```python +objects = [num_frames, max_objects, 16] # 16参数(多了inlier_ratio) +world = [num_frames, 11] # 11参数(包含scene_center) +physics = [max_objects, 3] # 物理属性(可选) +``` + +## 🔧 依赖项 + +### Python 包 +- `viser` - 3D 可视化服务器 +- `numpy` - 数值计算 +- `scipy` - 旋转矩阵计算 +- `opencv-python` - 图像处理 +- `pillow` - 图像加载 + +### 本地模块 +- `depth_to_pointcloud.py` - 深度图转点云工具 + +### 安装依赖 + +```bash +pip install viser numpy scipy opencv-python pillow +``` + +## 🎯 使用场景 + +### 1. 检查训练质量 +对比蓝色(预测)和红色(GT)mesh 的重合程度 + +### 2. 调试模型 +实时查看每一帧的生成结果 + +### 3. 制作演示 +通过播放功能展示训练效果 + +### 4. 数据分析 +结合点云查看模型对原始数据的拟合质量 + +## 🐛 常见问题 + +### Q: 找不到训练输出? +A: 确保 `core_space` 目录存在且包含 `*_text2wave` 格式的文件夹。 + +### Q: 无法显示点云? +A: 检查 `original_data/depth/` 目录是否存在深度文件。 + +### Q: Mesh 显示不正确? +A: 尝试调整"网格分辨率"或检查 superquadric 参数是否正确。 + +### Q: 端口被占用? +A: 脚本会自动检测并使用下一个可用端口(8080→8081→8082...),无需手动干预。也可以用 `--port` 参数指定起始端口。 + +### Q: 浏览器没有自动打开? +A: 检查系统的默认浏览器设置。或者使用 `--no-browser` 参数禁用自动打开,然后手动访问终端显示的URL。 + +### Q: 页面一直显示空白? +A: 尝试刷新浏览器(Ctrl+Shift+R)或切换到 Chrome/Firefox 浏览器。 + +## 📝 开发说明 + +### 添加新功能 + +1. 在 `TrainingVisualizer` 类中添加方法 +2. 在 `setup_gui()` 中添加 GUI 控件 +3. 
在对应的回调函数中实现逻辑 + +### 性能优化 + +- 使用对象池避免重复创建 mesh +- 预加载相邻帧数据 +- 调整网格分辨率平衡质量和性能 + +## 📄 许可 + +与 WaveGen 项目保持一致 diff --git a/nano_WaveGen/utils/__init__.py b/nano_WaveGen/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cede9ee5f5caa1e6a905560d881062933eb31a0a --- /dev/null +++ b/nano_WaveGen/utils/__init__.py @@ -0,0 +1,6 @@ +"""Utility package exports for WaveGen.""" + +from .model_utils import * # noqa: F401,F403 + +__all__ = [] +__all__ += [name for name in globals() if not name.startswith('_')] diff --git a/nano_WaveGen/utils/__pycache__/__init__.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0710ed76fb457cfc4ae0b91525495a1a05c7e078 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/__init__.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/__init__.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2777d09a2758c5887a48ad543435de98bf1685b4 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/dataparallel_utils.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/dataparallel_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69f9e8bc3ae41c1772c6e3bfd3fe1c98f89548a3 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/dataparallel_utils.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d59a5b1ef06b9f88320e148a70398a9e860fa571 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-312.pyc 
b/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87687ce814ca17013042ccdb72470faee85db199 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/depth_to_pointcloud.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f96c54d803b9a9098d9ffaa96962244539d1946c Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d41a41da4f5c80d6c01ae76d72dffff88507e3ec Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/gpu_utils.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/model_utils.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/model_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7be1d9a723dad69eb9a383eea30fe2096ef8d869 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/model_utils.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/model_utils.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/model_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..52daab4f5f41d9396a7eb7c2afd270dd0cc0c39e Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/model_utils.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/parallel_encoder_wrapper.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/parallel_encoder_wrapper.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65987724c39796d85d9f41a1dc3fde96a89b0bdf Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/parallel_encoder_wrapper.cpython-312.pyc differ 
diff --git a/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5cba6936e8cac5a7555763adeb81e51815c6f8d Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..64f03a728ce1b9b3f631c55cc69272089ad8261d Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/save_generation_results.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/utils.cpython-311.pyc b/nano_WaveGen/utils/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22a410f319a8d6d97cd3f3c4b771cc6068879b5e Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/utils.cpython-311.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/utils.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7fa1ac1623ae2d9a790b8b342dfb2625f685cea3 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/utils.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/__pycache__/visualize_training.cpython-312.pyc b/nano_WaveGen/utils/__pycache__/visualize_training.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc0d16565e208fe712b90f4e785a31d558bf89e8 Binary files /dev/null and b/nano_WaveGen/utils/__pycache__/visualize_training.cpython-312.pyc differ diff --git a/nano_WaveGen/utils/depth_to_pointcloud.py b/nano_WaveGen/utils/depth_to_pointcloud.py new file mode 100644 index 0000000000000000000000000000000000000000..8deb6fcb98595539863531ec8b4232acd185412d --- /dev/null +++ 
class DepthToPointCloud:
    """Convert depth maps and RGB images into point clouds.

    Handles generic MetricDepth outputs (NPZ depth files) and includes
    MOVi-dataset-specific conversion that mirrors the training pipeline's
    normalization (whole scene scaled into the [-10, 10] cube).
    """

    def __init__(self,
                 min_depth: float = 0.1,
                 max_depth: float = 100.0,
                 max_points: int = 50000):
        """Initialize the converter.

        Args:
            min_depth: Minimum valid depth in meters.
            max_depth: Maximum valid depth in meters.
            max_points: Maximum number of points kept per cloud
                (random downsampling when exceeded).
        """
        self.min_depth = min_depth
        self.max_depth = max_depth
        self.max_points = max_points

    def load_depth_npz(self, npz_path: str) -> np.ndarray:
        """Load a MetricDepth NPZ file.

        Args:
            npz_path: Path to the NPZ file.

        Returns:
            2-D depth map in meters (float32).

        Raises:
            ValueError: If the archive contains no arrays at all.
        """
        data = np.load(npz_path)

        # MetricDepth usually stores the depth under 'depth' or 'pred'.
        if 'depth' in data:
            depth = data['depth']
        elif 'pred' in data:
            depth = data['pred']
        else:
            # Fall back to the first array in the archive.
            keys = list(data.keys())
            if len(keys) > 0:
                depth = data[keys[0]]
            else:
                raise ValueError(f"无法从NPZ文件中找到深度数据: {npz_path}")

        # Squeeze a leading or trailing singleton channel so the result is 2-D.
        if depth.ndim == 3 and depth.shape[0] == 1:
            depth = depth[0]
        elif depth.ndim == 3 and depth.shape[2] == 1:
            depth = depth[:, :, 0]

        return depth.astype(np.float32)

    def load_rgb_image(self, image_path: str, target_size: Optional[Tuple[int, int]] = None) -> np.ndarray:
        """Load an RGB image.

        Args:
            image_path: Path to the image file.
            target_size: Optional (H, W) to resize to.

        Returns:
            uint8 array of shape (H, W, 3) with values in [0, 255].
        """
        img = Image.open(image_path)

        # Normalize any palette/greyscale/alpha mode to plain RGB.
        if img.mode != 'RGB':
            img = img.convert('RGB')

        # PIL's resize takes (W, H), hence the swapped order.
        if target_size is not None:
            img = img.resize((target_size[1], target_size[0]), Image.BILINEAR)

        return np.array(img, dtype=np.uint8)

    def compute_world_scale(self, depth_maps: List[np.ndarray]) -> float:
        """Estimate a characteristic scene scale used for normalization.

        Args:
            depth_maps: List of depth maps.

        Returns:
            The 85th percentile of all valid depths, or 10.0 if no
            valid depth sample exists.
        """
        all_depths = []

        for depth_map in depth_maps:
            valid_depths = depth_map[(depth_map > self.min_depth) & (depth_map < self.max_depth)]
            if len(valid_depths) > 0:
                all_depths.extend(valid_depths.flatten())

        if len(all_depths) == 0:
            return 10.0  # fallback default when every frame is empty

        # A high percentile is robust against far-away outliers.
        all_depths = np.array(all_depths)
        world_scale = np.percentile(all_depths, 85)

        return float(world_scale)

    def depth_to_pointcloud(self,
                            depth_map: np.ndarray,
                            rgb_image: np.ndarray,
                            world_scale: float,
                            camera_intrinsics: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
        """Convert a depth map into a colored point cloud.

        Args:
            depth_map: Depth map (H, W) in meters.
            rgb_image: RGB image (H, W, 3).
            world_scale: Scene scale used to normalize the Z coordinate.
            camera_intrinsics: Optional 3x3 intrinsics matrix; a default
                pinhole model (60-degree FOV) is assumed when omitted.

        Returns:
            points: (N, 3) float32 coordinates, XY normalized to roughly
                [-1, 1], Z divided by ``world_scale``.
            colors: (N, 3) uint8 colors in [0, 255].
        """
        h, w = depth_map.shape

        # Make sure the RGB image matches the depth resolution.
        if rgb_image.shape[:2] != (h, w):
            rgb_image = cv2.resize(rgb_image, (w, h), interpolation=cv2.INTER_LINEAR)

        # Pixel coordinate grid.
        u = np.arange(w)
        v = np.arange(h)
        u_grid, v_grid = np.meshgrid(u, v)

        if camera_intrinsics is None:
            # Assume a 60-degree field of view (tan of the 30-degree half angle).
            fx = fy = w / (2 * np.tan(np.radians(30)))
            cx = w / 2
            cy = h / 2
        else:
            fx = camera_intrinsics[0, 0]
            fy = camera_intrinsics[1, 1]
            cx = camera_intrinsics[0, 2]
            cy = camera_intrinsics[1, 2]

        # Back-project pixels into 3-D camera space.
        z = depth_map
        x = (u_grid - cx) * z / fx
        y = (v_grid - cy) * z / fy

        x_flat = x.flatten()
        y_flat = y.flatten()
        z_flat = z.flatten()

        r = rgb_image[:, :, 0].flatten()
        g = rgb_image[:, :, 1].flatten()
        b = rgb_image[:, :, 2].flatten()

        # Keep only points with a plausible depth.
        valid_mask = (z_flat > self.min_depth) & (z_flat < self.max_depth)

        if valid_mask.sum() < 100:  # too few valid points -> empty cloud
            return np.zeros((0, 3), dtype=np.float32), np.zeros((0, 3), dtype=np.uint8)

        x_valid = x_flat[valid_mask]
        y_valid = y_flat[valid_mask]
        z_valid = z_flat[valid_mask]
        r_valid = r[valid_mask]
        g_valid = g[valid_mask]
        b_valid = b[valid_mask]

        # Normalize XY by their 95th-percentile magnitude so most points
        # land inside [-1, 1] regardless of camera geometry.
        x_range = np.percentile(np.abs(x_valid), 95)
        y_range = np.percentile(np.abs(y_valid), 95)

        x_normalized = x_valid / (x_range + 1e-6)
        y_normalized = y_valid / (y_range + 1e-6)

        # Z is normalized by the precomputed world scale.
        z_normalized = z_valid / world_scale

        points = np.stack([x_normalized, y_normalized, z_normalized], axis=1)
        colors = np.stack([r_valid, g_valid, b_valid], axis=1)

        # Random downsampling to the configured point budget.
        if len(points) > self.max_points:
            indices = np.random.choice(len(points), self.max_points, replace=False)
            points = points[indices]
            colors = colors[indices]

        return points.astype(np.float32), colors.astype(np.uint8)

    def process_sequence(self,
                         depth_folder: Path,
                         rgb_folder: Path,
                         num_frames: Optional[int] = None) -> List[Tuple[np.ndarray, np.ndarray]]:
        """Convert an aligned depth/RGB sequence into point clouds.

        Args:
            depth_folder: Folder containing depth ``.npz`` files.
            rgb_folder: Folder containing RGB ``.jpg``/``.png`` images.
            num_frames: Number of frames to process; ``None`` means all.

        Returns:
            List of ``(points, colors)`` tuples, one per frame.

        Raises:
            ValueError: If either folder contains no matching files.
        """
        depth_files = sorted(list(depth_folder.glob("*.npz")))
        rgb_files = sorted(list(rgb_folder.glob("*.jpg")) + list(rgb_folder.glob("*.png")))

        if len(depth_files) == 0:
            raise ValueError(f"未找到深度文件: {depth_folder}")
        if len(rgb_files) == 0:
            raise ValueError(f"未找到RGB文件: {rgb_folder}")

        # Never read past the shorter of the two file lists.
        num_frames_available = min(len(depth_files), len(rgb_files))
        if num_frames is None:
            num_frames = num_frames_available
        else:
            num_frames = min(num_frames, num_frames_available)

        print(f"处理 {num_frames} 帧...")

        # Load all depth maps first so the world scale covers the whole clip.
        depth_maps = []
        for i in range(num_frames):
            depth_map = self.load_depth_npz(str(depth_files[i]))
            depth_maps.append(depth_map)

        world_scale = self.compute_world_scale(depth_maps)
        print(f"计算得到世界尺度: {world_scale:.2f} 米")

        # Convert frame by frame.
        point_clouds = []
        for i in range(num_frames):
            rgb_image = self.load_rgb_image(str(rgb_files[i]))

            points, colors = self.depth_to_pointcloud(
                depth_maps[i],
                rgb_image,
                world_scale
            )

            point_clouds.append((points, colors))
            print(f"帧 {i+1}/{num_frames}: {len(points)} 个点")

        return point_clouds

    def depth_to_normalized_pointcloud_movi(self,
                                            depth: np.ndarray,
                                            segmentation: Optional[np.ndarray] = None,
                                            camera_K: Optional[np.ndarray] = None,
                                            camera_position: Optional[np.ndarray] = None,
                                            camera_quaternion: Optional[np.ndarray] = None,
                                            resolution: int = 128,
                                            convert_to_zdepth: bool = True,
                                            scene_center_override: Optional[np.ndarray] = None,
                                            scene_scale_override: Optional[float] = None) -> Tuple:
        """MOVi-specific: convert a depth map to a [-10, 10]-normalized cloud.

        Kept numerically identical to the training-side implementation.

        Args:
            depth: (H, W, 1) depth array (euclidean distance from camera).
            segmentation: (H, W) instance segmentation mask; all-zeros when
                missing.
            camera_K: 3x3 intrinsics; a default pinhole model is assumed
                when omitted.
            camera_position: Camera position in world coordinates.
            camera_quaternion: Camera quaternion in (x, y, z, w) order.
            resolution: Image resolution (assumed square); kept for
                interface compatibility.
            convert_to_zdepth: Convert euclidean depth to planar z-depth.
            scene_center_override: Optional fixed scene center.
            scene_scale_override: Optional fixed scene scale factor.

        Returns:
            tuple: (instance_pointclouds, points_3d_normalized,
                segmentation, scene_center, scene_extent).
        """
        H, W = depth.shape[:2]

        # Robustness: the original crashed when camera_K was None despite the
        # Optional annotation; fall back to the same default pinhole model as
        # depth_to_pointcloud (60-degree FOV).
        if camera_K is None:
            f_default = W / (2 * np.tan(np.radians(30)))
            camera_K = np.array([[f_default, 0.0, W / 2],
                                 [0.0, f_default, H / 2],
                                 [0.0, 0.0, 1.0]], dtype=np.float64)

        # Camera intrinsics.
        fx = camera_K[0, 0]
        fy = camera_K[1, 1]
        cx = camera_K[0, 2]
        cy = camera_K[1, 2]

        # Pixel grid in normalized camera coordinates.
        xx, yy = np.meshgrid(np.arange(W), np.arange(H))
        x_norm = (xx - cx) / fx
        y_norm = (yy - cy) / fy

        if convert_to_zdepth:
            # MOVi stores euclidean distance; convert to planar z-depth.
            # euclidean^2 = x^2 + y^2 + z^2 with x = x_norm*z, y = y_norm*z
            # => euclidean^2 = (x_norm^2 + y_norm^2 + 1) * z^2
            z = depth[:, :, 0] / np.sqrt(x_norm**2 + y_norm**2 + 1)
        else:
            # Depth is already planar z-depth.
            z = depth[:, :, 0]

        # Back-project to camera-space 3-D points.
        x = x_norm * z
        y = y_norm * z
        points_3d_camera = np.stack([x, y, z], axis=-1)

        # Camera -> world transform when a pose is supplied.
        if camera_position is not None and camera_quaternion is not None:
            # MOVi quaternions are in [x, y, z, w] order.
            cam_rot = Rotation.from_quat(camera_quaternion)
            cam_rot_matrix = cam_rot.as_matrix()

            # World = R * Camera + T
            points_3d_flat = points_3d_camera.reshape(-1, 3)
            points_3d_world = points_3d_flat @ cam_rot_matrix.T + camera_position
            points_3d = points_3d_world.reshape(points_3d_camera.shape)
        else:
            points_3d = points_3d_camera

        # Normalize the whole scene into the [-10, 10] cube, using only
        # pixels with a positive depth to estimate the bounds.
        valid_mask = z > 0
        valid_points = points_3d[valid_mask]

        if scene_center_override is not None and scene_scale_override is not None:
            scene_center = np.array(scene_center_override, dtype=np.float32)
            scene_extent = 20.0 / float(scene_scale_override) if scene_scale_override != 0 else 1.0
            points_3d_normalized = (points_3d - scene_center) * float(scene_scale_override)
        elif len(valid_points) > 0:
            scene_min = np.min(valid_points, axis=0)
            scene_max = np.max(valid_points, axis=0)
            scene_center = (scene_min + scene_max) / 2
            scene_extent = np.max(scene_max - scene_min)

            # Scale so the largest axis spans exactly -10..10.
            if scene_extent > 0:
                scale_factor = 20.0 / scene_extent
                points_3d_normalized = (points_3d - scene_center) * scale_factor
            else:
                points_3d_normalized = points_3d - scene_center
        else:
            points_3d_normalized = points_3d
            scene_center = np.zeros(3)
            scene_extent = 1.0

        # Per-instance clouds; a missing segmentation is treated as all
        # background (id 0), yielding no instances.
        if segmentation is None:
            segmentation = np.zeros(depth.shape[:2], dtype=np.int32)
        instance_ids = np.unique(segmentation)
        instance_ids = instance_ids[instance_ids > 0]

        instance_pointclouds = {}

        for inst_id in instance_ids:
            mask = segmentation == inst_id

            # Points are already normalized together with the scene.
            instance_points = points_3d_normalized[mask]

            if len(instance_points) < 50:  # skip tiny fragments
                continue

            instance_pointclouds[int(inst_id)] = instance_points

        # Full scene cloud and segmentation are returned for visualization.
        return instance_pointclouds, points_3d_normalized, segmentation, scene_center, scene_extent

    def process_movi_frame(self,
                           depth_path: str,
                           segmentation_path: Optional[str] = None,
                           rgb_path: Optional[str] = None,
                           camera_K: Optional[np.ndarray] = None,
                           camera_position: Optional[np.ndarray] = None,
                           camera_quaternion: Optional[np.ndarray] = None) -> Dict:
        """Process a single MOVi frame.

        Args:
            depth_path: Path to the depth ``.npy`` file.
            segmentation_path: Optional path to the segmentation ``.npy``.
            rgb_path: Optional path to the RGB image.
            camera_K: Camera intrinsics.
            camera_position: Camera position.
            camera_quaternion: Camera quaternion (x, y, z, w).

        Returns:
            dict with keys ``instance_pointclouds``,
            ``points_3d_normalized``, ``segmentation``, ``scene_center``,
            ``scene_extent`` and ``rgb_image``.
        """
        depth = np.load(depth_path)

        segmentation = None
        if segmentation_path and Path(segmentation_path).exists():
            segmentation = np.load(segmentation_path)

        rgb = None
        if rgb_path and Path(rgb_path).exists():
            rgb = self.load_rgb_image(rgb_path)

        # BUG FIX: depth_to_normalized_pointcloud_movi returns a tuple; the
        # original code did `result['rgb_image'] = rgb` on that tuple, which
        # raised TypeError. Unpack into the dict the docstring promises.
        (instance_pointclouds,
         points_3d_normalized,
         segmentation_out,
         scene_center,
         scene_extent) = self.depth_to_normalized_pointcloud_movi(
            depth=depth,
            segmentation=segmentation,
            camera_K=camera_K,
            camera_position=camera_position,
            camera_quaternion=camera_quaternion,
            convert_to_zdepth=True
        )

        return {
            'instance_pointclouds': instance_pointclouds,
            'points_3d_normalized': points_3d_normalized,
            'segmentation': segmentation_out,
            'scene_center': scene_center,
            'scene_extent': scene_extent,
            'rgb_image': rgb,
        }
Returns: + Attribute value + """ + if isinstance(model, nn.DataParallel): + return getattr(model.module, attr_name) + else: + return getattr(model, attr_name) + +def set_model_attr(model, attr_name, value): + """ + Set attribute on model, handling DataParallel wrapper + + Args: + model: Model (possibly wrapped in DataParallel) + attr_name: Attribute name to set + value: Value to set + """ + if isinstance(model, nn.DataParallel): + setattr(model.module, attr_name, value) + else: + setattr(model, attr_name, value) + +def call_model_method(model, method_name, *args, **kwargs): + """ + Call method on model, handling DataParallel wrapper + + Args: + model: Model (possibly wrapped in DataParallel) + method_name: Method name to call + *args, **kwargs: Arguments to pass to method + + Returns: + Method return value + """ + if isinstance(model, nn.DataParallel): + return getattr(model.module, method_name)(*args, **kwargs) + else: + return getattr(model, method_name)(*args, **kwargs) \ No newline at end of file diff --git a/nano_WaveGen/utils/save_generation_results.py b/nano_WaveGen/utils/save_generation_results.py new file mode 100644 index 0000000000000000000000000000000000000000..899bdad8b4bac61e6aa0d2cb8db747ff555d3ff9 --- /dev/null +++ b/nano_WaveGen/utils/save_generation_results.py @@ -0,0 +1,590 @@ +""" +保存生成结果的工具函数 +用于WaveGen v33 - 使用超二次元函数的版本 +""" +import os +import numpy as np +import torch +from pathlib import Path +from typing import Dict, List, Optional +import json +from datetime import datetime +import shutil + + +def save_generation_results( + predictions: List[Dict], + targets: Dict[str, torch.Tensor], + texts: List[str], + step: int, + output_dir: str = "outputs", + save_config: Dict = None, + metadata: Dict = None, + batch_data: Dict = None, # 新增:完整的批次数据 + data_root: str = None, # 新增:原始数据根目录 + data_split: str = "validation" # 新增:数据集split(train/validation) +): + """ + 保存生成结果用于可视化和分析(增强版) + + Args: + predictions: 模型预测结果列表,每个元素包含 'objects', 'world', 'physics' 
+ targets: 真实目标数据 + texts: 输入文本描述 + step: 当前训练步数 + output_dir: 输出目录 + save_config: 保存配置 + metadata: 额外的元数据(如序列名称、相机参数等) + batch_data: 完整的批次数据,用于获取更多原始信息 + data_root: 原始数据根目录,用于复制原始文件 + data_split: 数据集split('train' 或 'validation'),用于确定原始数据的正确位置 + """ + # 创建时间戳目录 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + save_path = Path(output_dir) / f"{timestamp}_step{step}_text2wave" + save_path.mkdir(parents=True, exist_ok=True) + + # 保存配置 + if save_config is not None: + with open(save_path / "save_config.json", 'w') as f: + json.dump(save_config, f, indent=2) + + batch_size = len(texts) + num_frames = len(predictions) + + print(f"\n{'='*60}") + print(f"💾 保存生成结果: step {step}") + print(f"📁 保存路径: {save_path}") + print(f"📊 样本数: {batch_size}, 帧数: {num_frames}") + print(f"{'='*60}\n") + + for i in range(batch_size): + try: + sample_dir = save_path / f"sample_{i}" + sample_dir.mkdir(exist_ok=True) + print(f"正在保存 sample_{i}... ", end='', flush=True) + + # 1. 保存文本描述和元信息 + with open(sample_dir / "info.txt", 'w') as f: + f.write(f"Text: {texts[i]}\n") + f.write(f"Generated at step: {step}\n") + f.write(f"Number of frames: {num_frames}\n") + + if metadata and 'sequence_names' in metadata and metadata['sequence_names'] is not None: + f.write(f"Sequence: {metadata['sequence_names'][i]}\n") + + f.write("\n--- Model Output Summary ---\n") + f.write(f"Max objects: {predictions[0]['objects'].shape[1]}\n") + f.write("Object parameters: 15 (exists + shape[2] + scale[3] + translation[3] + rotation[3] + velocity[3])\n") + f.write(f"World parameters: 8 (camera_pos[3] + camera_quat[4] + scene_scale[1])\n") + f.write(f"Physics parameters: 3 (mass + friction + restitution)\n") + + # 2. 
保存生成的超二次元函数参数(改进格式) + frame_predictions = [] + physics_per_frame = [] # 保存每一帧的physics预测,便于查看全序列 + for f_idx, pred in enumerate(predictions): + # predictions列表中每个元素包含当前帧、完整batch的数据,这里先取出当前样本 + objects_batch = pred['objects'] + world_batch = pred['world'] + physics_batch = pred.get('physics') + + objects_params = objects_batch[i].cpu().numpy() if hasattr(objects_batch, 'cpu') else objects_batch[i] # [max_objects, 15] + world_params = world_batch[i].cpu().numpy() if hasattr(world_batch, 'cpu') else world_batch[i] # [8] + if physics_batch is not None: + physics_params = physics_batch[i].cpu().numpy() if hasattr(physics_batch, 'cpu') else physics_batch[i] # [max_objects, 3] + else: + physics_params = np.zeros((objects_params.shape[0], 3), dtype=np.float32) + + # 保存当前帧的 physics(每帧各物体的mass/friction/restitution) + physics_per_frame.append([ + { + 'mass': float(phys_params[0]), + 'friction': float(phys_params[1]), + 'restitution': float(phys_params[2]), + } + for phys_params in physics_params + ]) + + # 将物体参数转换为更易读的格式 + superquadrics = [] + for obj_idx in range(objects_params.shape[0]): + obj_params = objects_params[obj_idx] + phys_params = physics_params[obj_idx] + + superquadric = { + 'exists': bool(obj_params[0] > 0.5), # exists flag + 'shape': obj_params[1:3], # epsilon1, epsilon2 + 'scale': obj_params[3:6], # a, b, c + 'translation': obj_params[6:9], # x, y, z + 'rotation': obj_params[9:12], # euler angles + # 预测没有 inlier_ratio,填充0以保持键一致 + 'inlier_ratio': 0.0, + 'velocity': obj_params[12:15], # vx, vy, vz + 'mass': phys_params[0], + 'friction': phys_params[1], + 'restitution': phys_params[2], + } + superquadrics.append(superquadric) + + # 将世界参数转换为更易读的格式 + world_info = { + 'camera_position': world_params[0:3], # x, y, z + 'camera_quaternion': world_params[3:7], # w, x, y, z + 'scene_scale': float(world_params[7]), # scale + # 预测没有scene_center,填零保持字段一致,方便下游读取 + 'scene_center': np.zeros(3, dtype=np.float32), + } + + frame_data = { + 'frame_idx': f_idx, + 'superquadrics': 
superquadrics, + 'world_info': world_info, + } + frame_predictions.append(frame_data) + + np.savez(sample_dir / "predictions.npz", + text=texts[i], + frames=frame_predictions, + num_frames=num_frames, + # 保存全序列physics;未预测则写None + physics=physics_per_frame if physics_per_frame else None, + sequence_name=metadata['sequence_names'][i] if (metadata and 'sequence_names' in metadata and metadata['sequence_names'] is not None) else "unknown", + description="Predicted superquadric parameters for each frame") + + # 3. 保存真实目标数据(如果有)- 改进格式 + if targets is not None: + # targets中的数据已经是完整批次,需要索引[i]获取当前样本 + target_objects = targets['objects'][i].cpu().numpy() if hasattr(targets['objects'], 'cpu') else targets['objects'][i] # [num_frames, max_objects, 16] + target_world = targets['world'][i].cpu().numpy() if hasattr(targets['world'], 'cpu') else targets['world'][i] # [num_frames, 11] + + if 'physics' in targets and targets['physics'] is not None: + target_physics = targets['physics'][i].cpu().numpy() if hasattr(targets['physics'], 'cpu') else targets['physics'][i] + else: + target_physics = None + + # 生成顶层 physics,与原始 Full_Sample_Data_for_Learning_Target 一致 + target_physics_top = None + if target_physics is not None: + target_physics_top = [ + { + 'mass': float(p[0]), + 'friction': float(p[1]), + 'restitution': float(p[2]), + } + for p in target_physics + ] + + # 将目标数据转换为更易读的格式 + target_frames = [] + for f_idx in range(target_objects.shape[0]): + frame_objects = target_objects[f_idx] # [max_objects, 16] + frame_world = target_world[f_idx] # [11] + + # 转换物体参数 + superquadrics = [] + for obj_idx in range(frame_objects.shape[0]): + obj_params = frame_objects[obj_idx] + phys_params = target_physics[obj_idx] if target_physics is not None else np.zeros(3) + + superquadric = { + 'exists': bool(obj_params[0] > 0.5), # exists flag + 'shape': obj_params[1:3], # epsilon1, epsilon2 + 'scale': obj_params[3:6], # a, b, c + 'translation': obj_params[6:9], # x, y, z + 'rotation': obj_params[9:12], 
# euler angles + 'inlier_ratio': float(obj_params[12]), # GT specific: inlier ratio + 'velocity': obj_params[13:16], # vx, vy, vz + 'mass': phys_params[0], + 'friction': phys_params[1], + 'restitution': phys_params[2], + } + superquadrics.append(superquadric) + + # 转换世界参数 + world_info = { + 'camera_position': frame_world[0:3], # x, y, z + 'camera_quaternion': frame_world[3:7], # w, x, y, z + 'scene_scale': float(frame_world[7]), # scale + 'scene_center': frame_world[8:11], # center x, y, z + } + + frame_data = { + 'frame_idx': f_idx, + 'superquadrics': superquadrics, + 'world_info': world_info, + } + target_frames.append(frame_data) + + # 保存改进格式的 targets.npz + np.savez(sample_dir / "targets.npz", + text=texts[i], + frames=target_frames, + num_frames=num_frames, + physics=target_physics_top if target_physics_top is not None else None, + sequence_name=metadata['sequence_names'][i] if (metadata and 'sequence_names' in metadata and metadata['sequence_names'] is not None) else "unknown", + description="Ground truth superquadric parameters for each frame") + + # 为了兼容性,也保存原始格式(用于误差计算) + target_data_legacy = { + 'objects': target_objects, + 'world': target_world, + 'physics': target_physics, + } + + # 计算并保存误差统计(使用旧格式) + save_error_statistics(frame_predictions, target_data_legacy, sample_dir) + + # 4. 保存相机参数(如果有) + if metadata and 'camera_data' in metadata: + camera_data = metadata['camera_data'][i] + np.savez(sample_dir / "camera_params.npz", + **camera_data) + + # 5. 
保存原始数据(新增功能,不再依赖camera_data) + if batch_data is not None and data_root is not None: + save_original_data(sample_dir, i, batch_data, metadata, data_root, data_split) + + print("✅") # 完成标记 + + except Exception as e: + print(f"❌ 错误: {e}") + import traceback + traceback.print_exc() + continue # 继续保存其他样本 + + # 注意:已移除save_visualization_script,因为不需要单独的可视化脚本 + + # 保存整体统计信息 + save_batch_statistics(predictions, targets, save_path) + + print(f"\n{'='*60}") + print(f"✅ 保存完成!") + print(f"📁 保存路径: {save_path}") + print(f"{'='*60}\n") + + return save_path + + +def save_error_statistics(predictions: List[Dict], targets: Dict, save_dir: Path): + """计算并保存预测误差统计 + + Args: + predictions: 新格式的帧列表 (包含 superquadrics 和 world_info) + targets: 旧格式的目标数据 (包含 objects, world 数组) + """ + stats = {} + + # 将新格式的 predictions 转换回数组格式进行误差计算 + object_errors = [] + world_errors = [] + + for frame in predictions: + frame_idx = frame['frame_idx'] + + # 从新格式重建物体数组 + superquadrics = frame['superquadrics'] + pred_objects = [] + for sq in superquadrics: + obj_params = np.zeros(15, dtype=np.float32) + obj_params[0] = 1.0 if sq['exists'] else 0.0 + obj_params[1:3] = sq['shape'] + obj_params[3:6] = sq['scale'] + obj_params[6:9] = sq['translation'] + obj_params[9:12] = sq['rotation'] + obj_params[12:15] = sq['velocity'] + pred_objects.append(obj_params) + pred_obj = np.array(pred_objects) + + # 获取目标物体数据 + target_obj_full = targets['objects'][frame_idx] + target_obj = target_obj_full[:, :pred_obj.shape[1]] # 对齐模型预测的维度 + + # 只计算存在的物体 + exists_mask = target_obj[:, 0] > 0.5 + if exists_mask.any(): + error = np.mean(np.abs(pred_obj[exists_mask] - target_obj[exists_mask])) + object_errors.append(error) + + # 从新格式重建世界参数数组 + world_info = frame['world_info'] + pred_world = np.concatenate([ + world_info['camera_position'], + world_info['camera_quaternion'], + [world_info['scene_scale']] + ]) + + # 获取目标世界数据 + target_world = targets['world'][frame_idx][:8] + error = np.mean(np.abs(pred_world - target_world)) + 
world_errors.append(error) + + stats['object_mae'] = float(np.mean(object_errors)) if object_errors else 0.0 + stats['world_mae'] = float(np.mean(world_errors)) + + # 保存统计信息 + with open(save_dir / "error_statistics.json", 'w') as f: + json.dump(stats, f, indent=2) + + +def save_batch_statistics(predictions: List[Dict], targets: Dict, save_dir: Path): + """保存整批数据的统计信息""" + batch_size = predictions[0]['objects'].shape[0] + + stats = { + 'batch_size': batch_size, + 'num_frames': len(predictions), + 'timestamp': datetime.now().isoformat(), + } + + # 统计每帧实际存在的物体数量 + if targets is not None: + objects_per_frame = [] + for f_idx in range(len(predictions)): + frame_objects = [] + for b_idx in range(batch_size): + exists = targets['objects'][b_idx, f_idx, :, 0] > 0.5 + frame_objects.append(int(exists.sum())) + objects_per_frame.append({ + 'frame': f_idx, + 'mean_objects': float(np.mean(frame_objects)), + 'max_objects': int(max(frame_objects)), + 'min_objects': int(min(frame_objects)), + }) + stats['objects_per_frame'] = objects_per_frame + + with open(save_dir / "batch_statistics.json", 'w') as f: + json.dump(stats, f, indent=2) + + +def save_visualization_script(save_dir: Path): + """保存用于可视化超二次元函数的Python脚本""" + script = '''#!/usr/bin/env python3 +""" +可视化生成的超二次元函数参数 +""" +import numpy as np +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D + +def superquadric_surface(a, b, c, e1, e2, n=50): + """生成超二次元函数表面点""" + eta = np.linspace(-np.pi/2, np.pi/2, n) + omega = np.linspace(-np.pi, np.pi, n) + eta, omega = np.meshgrid(eta, omega) + + x = a * np.sign(np.cos(eta)) * np.abs(np.cos(eta))**e1 * np.sign(np.cos(omega)) * np.abs(np.cos(omega))**e2 + y = b * np.sign(np.cos(eta)) * np.abs(np.cos(eta))**e1 * np.sign(np.sin(omega)) * np.abs(np.sin(omega))**e2 + z = c * np.sign(np.sin(eta)) * np.abs(np.sin(eta))**e1 + + return x, y, z + +# 加载预测数据 +data = np.load('predictions.npz', allow_pickle=True) +frames = data['frames'] + +# 可视化第一帧 +frame = frames[0] +objects 
= frame['objects'] # [max_objects, 12] + +fig = plt.figure(figsize=(12, 8)) +ax = fig.add_subplot(111, projection='3d') + +# 绘制每个存在的物体 +for obj_idx, obj_params in enumerate(objects): + if obj_params[0] > 0.5: # 物体存在 + # 提取参数 + shape_params = obj_params[1:3] # e1, e2 + scale = obj_params[3:6] # a, b, c + translation = obj_params[6:9] + rotation = obj_params[9:12] # 简化处理,暂不应用旋转 + + # 生成表面 + x, y, z = superquadric_surface(scale[0], scale[1], scale[2], + shape_params[0], shape_params[1]) + + # 应用平移 + x += translation[0] + y += translation[1] + z += translation[2] + + # 绘制 + ax.plot_surface(x, y, z, alpha=0.7, label=f'Object {obj_idx}') + +ax.set_xlabel('X') +ax.set_ylabel('Y') +ax.set_zlabel('Z') +ax.set_title('Generated Superquadric Objects') +plt.savefig('visualization.png') +plt.show() +''' + + with open(save_dir / "visualize.py", 'w') as f: + f.write(script) + + # 使脚本可执行 + os.chmod(save_dir / "visualize.py", 0o755) + + +def save_original_data(sample_dir: Path, sample_idx: int, batch_data: Dict, metadata: Dict, data_root: str, data_split: str = "validation"): + """ + 保存原始数据文件,包括RGB图像、深度图、分割图、点云等 + + Args: + sample_dir: 当前样本保存目录 + sample_idx: 批次中的样本索引 + batch_data: 批次数据 + metadata: 元数据 + data_root: MOVi数据集根目录 + data_split: 数据集split('train' 或 'validation') + """ + try: + # 获取原始序列名称 + sequence_name = None + if metadata and 'sequence_names' in metadata and metadata['sequence_names'] is not None: + sequence_name = metadata['sequence_names'][sample_idx] + + if not sequence_name: + print(f"Warning: No sequence name found for sample {sample_idx}") + # 创建一个说明文件,解释为什么没有原始数据 + no_original_data_file = sample_dir / "no_original_data.txt" + with open(no_original_data_file, 'w') as f: + f.write("原始数据未保存,因为无法获取序列名称。\n") + f.write("这可能是因为数据加载器没有返回sequence_names字段。\n") + f.write(f"Sample index: {sample_idx}\n") + f.write(f"Generated at step: {datetime.now().isoformat()}\n") + return + + # 构建原始数据路径 + data_root_path = Path(data_root) + + # 直接使用指定的data_split来定位原始数据 + original_sample_dir 
= data_root_path / data_split / sequence_name + + if not original_sample_dir.exists(): + print(f"Warning: Could not find original data for {sequence_name} in {data_split} split") + # 创建一个说明文件 + error_file = sample_dir / "original_data_not_found.txt" + with open(error_file, 'w') as f: + f.write(f"原始数据未找到\n") + f.write(f"查找路径: {original_sample_dir}\n") + f.write(f"数据集: {data_split}\n") + f.write(f"序列名: {sequence_name}\n") + f.write(f"时间: {datetime.now().isoformat()}\n") + return + + print(f"Copying original data from {original_sample_dir}") + + # 创建原始数据子目录 + original_data_dir = sample_dir / "original_data" + original_data_dir.mkdir(exist_ok=True) + + # 1. 复制RGB图像(所有帧) + rgb_dir = original_data_dir / "rgb" + rgb_dir.mkdir(exist_ok=True) + original_rgb_dir = original_sample_dir / "rgb" + if original_rgb_dir.exists(): + for rgb_file in sorted(original_rgb_dir.glob("frame_*.png")): + shutil.copy2(rgb_file, rgb_dir / rgb_file.name) + + # 2. 复制深度图(优先复制合并的npz,否则复制单独的npy) + depth_dir = original_data_dir / "depth" + depth_dir.mkdir(exist_ok=True) + original_depth_dir = original_sample_dir / "depth" + if original_depth_dir.exists(): + # 检查是否有合并的npz文件 + merged_depth = original_depth_dir / "depth_merge.npz" + if merged_depth.exists(): + shutil.copy2(merged_depth, depth_dir / "depth_merge.npz") + else: + # 复制单独的npy文件 + for depth_file in sorted(original_depth_dir.glob("frame_*.npy")): + shutil.copy2(depth_file, depth_dir / depth_file.name) + + # 3. 复制分割图(优先复制合并的npz,否则复制单独的npy) + seg_dir = original_data_dir / "segmentation" + seg_dir.mkdir(exist_ok=True) + original_seg_dir = original_sample_dir / "segmentation" + if original_seg_dir.exists(): + # 检查是否有合并的npz文件 + merged_seg = original_seg_dir / "segmentation_merge.npz" + if merged_seg.exists(): + shutil.copy2(merged_seg, seg_dir / "segmentation_merge.npz") + else: + # 复制单独的npy文件 + for seg_file in sorted(original_seg_dir.glob("frame_*.npy")): + shutil.copy2(seg_file, seg_dir / seg_file.name) + + # 4. 
复制法线图(优先复制合并的npz,否则复制单独的npy) + normal_dir = original_data_dir / "normal" + normal_dir.mkdir(exist_ok=True) + original_normal_dir = original_sample_dir / "normal" + if original_normal_dir.exists(): + # 检查是否有合并的npz文件 + merged_normal = original_normal_dir / "normal_merge.npz" + if merged_normal.exists(): + shutil.copy2(merged_normal, normal_dir / "normal_merge.npz") + else: + # 复制单独的npy文件 + for normal_file in sorted(original_normal_dir.glob("frame_*.npy")): + shutil.copy2(normal_file, normal_dir / normal_file.name) + + # 5. 复制相机轨迹 + camera_traj_file = original_sample_dir / "camera_trajectory.npz" + if camera_traj_file.exists(): + shutil.copy2(camera_traj_file, original_data_dir / "camera_trajectory.npz") + + # 6. 复制元数据JSON + metadata_file = original_sample_dir / "metadata.json" + if metadata_file.exists(): + shutil.copy2(metadata_file, original_data_dir / "metadata.json") + + # 7. 复制完整的训练目标数据缓存文件(Full_Sample_Data_for_Learning_Target.npz) + full_cache_file = original_sample_dir / "Full_Sample_Data_for_Learning_Target.npz" + if full_cache_file.exists(): + shutil.copy2(full_cache_file, original_data_dir / "Full_Sample_Data_for_Learning_Target.npz") + + # 8. 复制其他可能的合并文件(object_coordinates, point_clouds等) + for folder_name in ['object_coordinates', 'point_clouds']: + folder_dir = original_data_dir / folder_name + original_folder_dir = original_sample_dir / folder_name + if original_folder_dir.exists(): + folder_dir.mkdir(exist_ok=True) + # 检查是否有合并的npz文件 + merged_file = original_folder_dir / f"{folder_name}_merge.npz" + if merged_file.exists(): + shutil.copy2(merged_file, folder_dir / f"{folder_name}_merge.npz") + else: + # 复制单独的npy文件 + for npy_file in sorted(original_folder_dir.glob("frame_*.npy")): + shutil.copy2(npy_file, folder_dir / npy_file.name) + + # 9. 如果有预处理的点云数据(在batch_data中),也保存 + if 'point_clouds' in batch_data: + pc_data = batch_data['point_clouds'][sample_idx] + np.savez_compressed(original_data_dir / "point_clouds.npz", **pc_data) + + # 10. 
保存场景归一化参数 + if 'scene_normalization' in batch_data: + norm_params = batch_data['scene_normalization'][sample_idx] + with open(original_data_dir / "scene_normalization.json", 'w') as f: + json.dump({ + 'scene_center': norm_params['center'].tolist() if hasattr(norm_params['center'], 'tolist') else norm_params['center'], + 'scene_scale': float(norm_params['scale']) if 'scale' in norm_params else 1.0, + 'scene_extent': float(norm_params['extent']) if 'extent' in norm_params else 1.0 + }, f, indent=2) + + # 11. 创建文件清单 + with open(original_data_dir / "file_manifest.txt", 'w') as f: + f.write(f"Original sequence: {sequence_name}\n") + f.write(f"Data split: {data_split}\n") + f.write(f"Original path: {original_sample_dir}\n") + f.write(f"Copied at: {datetime.now().isoformat()}\n\n") + f.write("Files included:\n") + for item in sorted(original_data_dir.rglob("*")): + if item.is_file() and item.name != "file_manifest.txt": + f.write(f"- {item.relative_to(original_data_dir)}\n") + + except Exception as e: + print(f"Error saving original data for sample {sample_idx}: {e}") + import traceback + traceback.print_exc() diff --git a/nano_WaveGen/utils/visualize_training.py b/nano_WaveGen/utils/visualize_training.py new file mode 100644 index 0000000000000000000000000000000000000000..bc8226b1c486608e14020883e30bb0484969bd29 --- /dev/null +++ b/nano_WaveGen/utils/visualize_training.py @@ -0,0 +1,1096 @@ +#!/usr/bin/env python3 +""" +WaveGen 训练结果可视化工具 (独立版本) +自动检索 core_space 目录并可视化训练输出 + +Usage: + cd code/WaveGen/nano_WaveGen + python utils/visualize_training.py +""" + +import numpy as np +import viser +import viser.transforms as viser_tf +from typing import Optional, Dict, List, Tuple, Any +import os +from pathlib import Path +import json +import cv2 +import time +import webbrowser +from scipy.spatial.transform import Rotation + +# 导入深度转点云模块 +try: + from depth_to_pointcloud import DepthToPointCloud +except ImportError: + # 尝试从当前目录导入 + import sys + 
sys.path.append(str(Path(__file__).parent)) + from depth_to_pointcloud import DepthToPointCloud + + +class TrainingVisualizer: + """WaveGen训练结果可视化器""" + + def __init__(self, core_space_dir: str = "core_space", port: int = 8080): + """ + 初始化可视化器 + + Args: + core_space_dir: core_space目录路径(相对于当前工作目录) + port: 起始端口号(如果占用会自动尝试下一个) + """ + self.core_space_dir = Path(core_space_dir) + if not self.core_space_dir.is_absolute(): + self.core_space_dir = Path.cwd() / self.core_space_dir + + # 启动Viser服务器,自动寻找可用端口 + self.server = None + self.port = port + max_attempts = 10 + + for attempt in range(max_attempts): + try_port = port + attempt + try: + # 不显示默认的config/diagnostics页,直接进入UI + self.server = viser.ViserServer(port=try_port, show_config=False) + self.port = try_port + print(f"🌐 Viser服务器已启动: http://localhost:{try_port}") + if attempt > 0: + print(f" (端口 {port} 被占用,自动使用端口 {try_port})") + break + except OSError as e: + if "Address already in use" in str(e): + if attempt == max_attempts - 1: + print(f"❌ 无法找到可用端口 (尝试了 {port}-{try_port})") + print(f" 请手动关闭其他实例: pkill -f visualize_training.py") + raise + continue + else: + raise + + # 可视化句柄 + self.superquadric_handles = [] + self.gt_superquadric_handles = [] + self.camera_handles = [] + self.camera_frustum_handles = [] + self.point_cloud_handle = None + self.camera_rgb_handle = None + self.coordinate_frame_handle = None + self.mesh_handles_pool = {} + + # 当前数据 + self.predictions_npz = None + self.targets_npz = None + self.current_sample_path = None + self.current_frame = 0 + self.original_frame_count = 0 + self.scene_center = np.array([0, 0, 0]) + self.scene_scale = 1.0 + + # GUI控件 + self.gui_controls = {} + + # 播放状态 + self.is_playing = False + + # 设置场景 + self.setup_scene() + + # 扫描训练输出 + self.scan_training_outputs() + + # 设置GUI (立即创建,不等待客户端连接) + self.setup_gui() + + print("✅ 训练可视化器已初始化") + print(f"📁 监控目录: {self.core_space_dir}") + + if len(self.training_outputs) == 0: + print("⚠️ 未找到训练输出,请检查 core_space 目录") + + def 
setup_scene(self): + """设置场景背景和坐标系""" + # 设置深蓝色背景 + bg_color = [13, 13, 38] + width, height = 1920, 1080 + solid_color_image = np.full((height, width, 3), bg_color, dtype=np.uint8) + self.server.scene.set_background_image(solid_color_image, format="png") + + # 设置坐标系方向 + self.server.scene.set_up_direction("+y") + + def scan_training_outputs(self): + """扫描core_space目录下的训练输出""" + self.training_outputs = [] + + if not self.core_space_dir.exists(): + print(f"⚠️ core_space目录不存在: {self.core_space_dir}") + return + + # 查找所有训练输出目录 (格式: YYYYMMDD_HHMMSS_stepN_text2wave) + # 按时间倒序(最新在前)方便默认选择最新样本 + for output_dir in sorted(self.core_space_dir.glob("*_text2wave"), reverse=True): + if output_dir.is_dir(): + # 查找样本目录 + sample_dirs = sorted(output_dir.glob("sample_*")) + if sample_dirs: + self.training_outputs.append({ + 'path': output_dir, + 'name': output_dir.name, + 'samples': len(sample_dirs) + }) + + print(f"📦 找到 {len(self.training_outputs)} 个训练输出") + for output in self.training_outputs: + print(f" - {output['name']} ({output['samples']} 样本)") + + def setup_gui(self): + """设置GUI控件""" + # 训练输出选择 + with self.server.gui.add_folder("训练输出"): + if self.training_outputs: + output_names = [out['name'] for out in self.training_outputs] + self.gui_controls['output_selector'] = self.server.gui.add_dropdown( + "选择训练输出", + options=output_names, + initial_value=output_names[0] + ) + self.gui_controls['output_selector'].on_update(self._on_output_change) + + # 样本选择 + self.gui_controls['sample_slider'] = self.server.gui.add_slider( + "样本索引", + min=0, + max=max(0, self.training_outputs[0]['samples'] - 1), + step=1, + initial_value=0 + ) + self.gui_controls['sample_slider'].on_update(self._on_sample_change) + + self.gui_controls['load_button'] = self.server.gui.add_button("加载样本") + self.gui_controls['load_button'].on_click(self._on_load_sample) + else: + self.server.gui.add_text("状态", initial_value="未找到训练输出") + + # 帧控制 + with self.server.gui.add_folder("帧控制"): + 
self.gui_controls['frame_slider'] = self.server.gui.add_slider( + "当前帧", + min=0, + max=23, + step=1, + initial_value=0 + ) + self.gui_controls['frame_slider'].on_update(self._on_frame_change) + + self.gui_controls['play_button'] = self.server.gui.add_button("▶ 播放") + self.gui_controls['play_button'].on_click(self._on_play) + + self.gui_controls['pause_button'] = self.server.gui.add_button("⏸ 暂停") + self.gui_controls['pause_button'].on_click(self._on_pause) + + self.gui_controls['fps_slider'] = self.server.gui.add_slider( + "播放FPS", + min=1, + max=30, + step=1, + initial_value=8 + ) + + # 生成结果控制 + with self.server.gui.add_folder("生成结果"): + self.gui_controls['show_generated'] = self.server.gui.add_checkbox( + "显示生成的超二次曲面", initial_value=True + ) + self.gui_controls['show_generated'].on_update(self._on_visibility_change) + + self.gui_controls['generated_opacity'] = self.server.gui.add_slider( + "生成结果透明度", min=0.1, max=1.0, step=0.05, initial_value=0.7 + ) + self.gui_controls['generated_opacity'].on_update(self._on_opacity_change) + + self.gui_controls['generated_color'] = self.server.gui.add_rgb( + "生成结果颜色", initial_value=(100, 149, 237) # 蓝色 + ) + self.gui_controls['generated_color'].on_update(self._on_color_change) + + # Ground Truth控制 + with self.server.gui.add_folder("Ground Truth"): + self.gui_controls['show_gt'] = self.server.gui.add_checkbox( + "显示GT超二次曲面", initial_value=True + ) + self.gui_controls['show_gt'].on_update(self._on_visibility_change) + + self.gui_controls['gt_opacity'] = self.server.gui.add_slider( + "GT透明度", min=0.1, max=1.0, step=0.05, initial_value=0.5 + ) + self.gui_controls['gt_opacity'].on_update(self._on_opacity_change) + + self.gui_controls['gt_color'] = self.server.gui.add_rgb( + "GT颜色", initial_value=(255, 99, 71) # 红色 + ) + self.gui_controls['gt_color'].on_update(self._on_color_change) + + # 点云控制 + with self.server.gui.add_folder("点云显示"): + self.gui_controls['show_pointcloud'] = self.server.gui.add_checkbox( + "显示点云", 
initial_value=True + ) + self.gui_controls['show_pointcloud'].on_update(self._on_visibility_change) + + self.gui_controls['pointcloud_size'] = self.server.gui.add_slider( + "点大小", min=0.001, max=0.02, step=0.001, initial_value=0.008 + ) + self.gui_controls['pointcloud_size'].on_update(self._on_visibility_change) + + # 网格质量 + with self.server.gui.add_folder("渲染设置"): + self.gui_controls['mesh_resolution'] = self.server.gui.add_slider( + "网格分辨率", min=10, max=50, step=5, initial_value=25 + ) + self.gui_controls['mesh_resolution'].on_update(self._on_mesh_resolution_change) + + self.gui_controls['show_coordinate'] = self.server.gui.add_checkbox( + "显示坐标系", initial_value=False + ) + self.gui_controls['show_coordinate'].on_update(self._on_visibility_change) + + # 相机控制 + with self.server.gui.add_folder("相机控制"): + self.gui_controls['reset_view'] = self.server.gui.add_button("重置视角") + self.gui_controls['reset_view'].on_click(self._on_reset_view) + + self.gui_controls['match_camera'] = self.server.gui.add_button("匹配GT相机") + self.gui_controls['match_camera'].on_click(self._on_match_camera) + + self.gui_controls['show_target_frustum'] = self.server.gui.add_checkbox( + "显示GT相机椎体", initial_value=True + ) + self.gui_controls['show_pred_frustum'] = self.server.gui.add_checkbox( + "显示预测相机椎体", initial_value=True + ) + self.gui_controls['show_camera_rgb'] = self.server.gui.add_checkbox( + "相机视锥显示RGB", initial_value=True + ) + self.gui_controls['show_target_frustum'].on_update(self._on_visibility_change) + self.gui_controls['show_pred_frustum'].on_update(self._on_visibility_change) + self.gui_controls['show_camera_rgb'].on_update(self._on_visibility_change) + + print(f"✅ GUI 已设置 - 创建了 {len(self.gui_controls)} 个控件") + + def _on_output_change(self, event): + """训练输出选择改变""" + selected_name = event.target.value + for i, output in enumerate(self.training_outputs): + if output['name'] == selected_name: + # 更新样本滑块范围 + max_sample = max(0, output['samples'] - 1) + 
def _on_sample_change(self, event):
    """Sample index slider moved (no-op: the user must press the load button)."""
    pass

def _on_load_sample(self, event):
    """Load the training-output sample currently selected in the GUI."""
    selected_name = self.gui_controls['output_selector'].value
    sample_idx = int(self.gui_controls['sample_slider'].value)

    # Resolve the selected display name back to a training-output directory.
    output_path = None
    for output in self.training_outputs:
        if output['name'] == selected_name:
            output_path = output['path']
            break

    if output_path is None:
        print(f"❌ 未找到训练输出: {selected_name}")
        return

    self.load_sample(output_path, sample_idx)

def _load_npz_as_dict(self, path: Path, label: str) -> Optional[Dict]:
    """Load an .npz file into a plain dict, closing the file handle.

    Returns None (with a warning) when the file is missing. ``label`` is the
    file's display name used in the log messages.
    """
    if not path.exists():
        print(f"⚠️ 未找到{label}")
        return None
    npz_data = np.load(path, allow_pickle=True)
    data = {key: npz_data[key] for key in npz_data.files}
    npz_data.close()
    print(f"✅ 加载{label}: {path}")
    if 'frames' in data:
        print(f" 帧数: {len(data['frames'])}")
    if 'text' in data:
        print(f" 文本: {data['text']}")
    return data

def load_sample(self, output_path: Path, sample_idx: int):
    """Load predictions/targets for one sample and render its first frame."""
    sample_path = output_path / f"sample_{sample_idx}"

    if not sample_path.exists():
        print(f"❌ 样本目录不存在: {sample_path}")
        return

    print(f"\n{'='*60}")
    print(f"📂 加载样本: {output_path.name}/sample_{sample_idx}")
    print(f"{'='*60}")

    self.current_sample_path = sample_path

    self.predictions_npz = self._load_npz_as_dict(
        sample_path / "predictions.npz", "predictions.npz")
    self.targets_npz = self._load_npz_as_dict(
        sample_path / "targets.npz", "targets.npz")

    # Derive the frame count: the new format stores a 'frames' list; the old
    # GT format stored an 'objects' tensor whose first axis is time.
    self.original_frame_count = 0
    if self.predictions_npz and 'frames' in self.predictions_npz:
        self.original_frame_count = len(self.predictions_npz['frames'])
    elif self.targets_npz and 'objects' in self.targets_npz:
        objects = self.targets_npz['objects']
        if hasattr(objects, 'shape') and len(objects.shape) >= 1:
            self.original_frame_count = objects.shape[0]

    if self.original_frame_count > 0:
        self.gui_controls['frame_slider'].max = self.original_frame_count - 1
        self.gui_controls['frame_slider'].value = 0
        self.current_frame = 0
        print(f"📊 总帧数: {self.original_frame_count}")

    # Render the first frame (visualize_frame itself guards count <= 0).
    self.visualize_frame(0)

def _on_frame_change(self, event):
    """Frame slider moved: render the requested frame."""
    frame_idx = int(event.target.value)
    self.visualize_frame(frame_idx)

def _on_play(self, event):
    """Start playback on a background daemon thread."""
    self.is_playing = True
    print("▶ 开始播放")

    import threading
    threading.Thread(target=self._playback_loop, daemon=True).start()

def _on_pause(self, event):
    """Stop playback (the playback thread exits on its next iteration)."""
    self.is_playing = False
    print("⏸ 暂停播放")

def _playback_loop(self):
    """Advance frames in a loop until paused.

    Fix: the old loop could die with ZeroDivisionError when the frame count
    was 0 (modulo) or the FPS slider read 0 (1.0 / fps); both are now guarded.
    """
    while self.is_playing:
        if self.original_frame_count <= 0:
            break
        current_frame = int(self.gui_controls['frame_slider'].value)
        next_frame = (current_frame + 1) % self.original_frame_count

        self.gui_controls['frame_slider'].value = next_frame
        self.visualize_frame(next_frame)

        fps = max(1, int(self.gui_controls['fps_slider'].value))
        time.sleep(1.0 / fps)

def _on_visibility_change(self, event):
    """Layer visibility toggled: re-render the current frame."""
    self.visualize_frame(self.current_frame)

def _on_opacity_change(self, event):
    """Opacity slider moved: re-render the current frame."""
    self.visualize_frame(self.current_frame)

def _on_color_change(self, event):
    """Color picker changed: re-render the current frame."""
    self.visualize_frame(self.current_frame)

def _on_mesh_resolution_change(self, event):
    """Mesh resolution changed: drop pooled meshes so they are regenerated."""
    for mesh in self.mesh_handles_pool.values():
        mesh.remove()
    self.mesh_handles_pool.clear()
    self.visualize_frame(self.current_frame)

def _on_reset_view(self, event):
    """Reset every connected client's camera to the default viewpoint."""
    for client in self.server.get_clients().values():
        client.camera.position = (3.0, 2.0, 3.0)
        client.camera.look_at = (0.0, 0.0, 0.0)

def _on_match_camera(self, event):
    """Snap all client cameras to the GT camera pose of the current frame."""
    if self.targets_npz is None or 'frames' not in self.targets_npz:
        print("⚠️ 没有GT相机数据")
        return

    frame_idx = self.current_frame
    frames = self.targets_npz['frames']

    if frame_idx >= len(frames):
        print("⚠️ 帧索引超出范围")
        return

    frame_data = frames[frame_idx]
    # Object-dtype npz entries come back as 0-d arrays wrapping a dict.
    if isinstance(frame_data, np.ndarray):
        frame_data = frame_data.item()

    if 'world_info' not in frame_data:
        print("⚠️ 未找到world_info数据")
        return

    world_info = frame_data['world_info']
    camera_position = world_info['camera_position']
    # Stored quaternion is xyzw; viser expects wxyz.
    q_xyzw = np.array(world_info['camera_quaternion'], dtype=np.float32)
    wxyz = (float(q_xyzw[3]), float(q_xyzw[0]), float(q_xyzw[1]), float(q_xyzw[2]))

    # Map into visualization coordinates: subtract scene_center, scale.
    cam_pos_vis = (np.array(camera_position) - self.scene_center) * self.scene_scale

    print(f"📷 匹配相机: pos={camera_position}, quat={wxyz}")

    for client in self.server.get_clients().values():
        client.camera.position = tuple(cam_pos_vis)
        client.camera.wxyz = wxyz

def _update_scene_alignment(self, frame_idx: int):
    """Refresh self.scene_center / self.scene_scale for the given frame.

    Preference order: scene_normalization.json under original_data, then the
    GT frame's world_info; falls back to identity (center 0, scale 1).
    Superquadrics stay in their stored coordinates; the point cloud and
    camera frusta use this center/scale.
    """
    self.scene_center = np.zeros(3, dtype=np.float32)
    self.scene_scale = 1.0

    norm_path = None
    if self.current_sample_path is not None:
        norm_path = self.current_sample_path / "original_data" / "scene_normalization.json"
    loaded_norm = False
    if norm_path is not None and norm_path.exists():
        try:
            with open(norm_path) as f:
                norm = json.load(f)
            if 'scene_center' in norm:
                self.scene_center = np.array(norm['scene_center'], dtype=np.float32)
            if 'scene_scale' in norm:
                self.scene_scale = float(norm['scene_scale'])
            elif 'scene_extent' in norm and norm['scene_extent']:
                # Scenes are normalized into a [-10, 10] cube (extent 20).
                self.scene_scale = 20.0 / float(norm['scene_extent'])
            loaded_norm = True
        except Exception:
            loaded_norm = False

    if not loaded_norm:
        wi = self._get_world_info(frame_idx, source="targets")
        if wi is not None:
            if 'scene_center' in wi:
                self.scene_center = np.array(wi['scene_center'], dtype=np.float32)
            if 'scene_scale' in wi:
                try:
                    self.scene_scale = float(wi['scene_scale'])
                except Exception:
                    pass

def visualize_frame(self, frame_idx: int):
    """Render one frame: point cloud, predicted/GT superquadrics, axes, cameras."""
    if self.original_frame_count <= 0:
        return

    frame_idx = int(np.clip(frame_idx, 0, self.original_frame_count - 1))
    self.current_frame = frame_idx

    print(f"\n🎨 可视化帧 {frame_idx}/{self.original_frame_count-1}")

    # Hide/remove the previous frame's handles before re-drawing.
    self.clear_visualization()

    # Snapshot GUI state once.
    show_generated = self.gui_controls['show_generated'].value
    show_gt = self.gui_controls['show_gt'].value
    show_pointcloud = self.gui_controls['show_pointcloud'].value
    show_coordinate = self.gui_controls['show_coordinate'].value

    generated_opacity = self.gui_controls['generated_opacity'].value
    gt_opacity = self.gui_controls['gt_opacity'].value
    generated_color = tuple(self.gui_controls['generated_color'].value)
    gt_color = tuple(self.gui_controls['gt_color'].value)
    mesh_resolution = int(self.gui_controls['mesh_resolution'].value)

    predictions = self._extract_predictions(frame_idx)
    targets = self._extract_targets(frame_idx)

    # Scene alignment (center/scale) for point cloud and camera frusta.
    self._update_scene_alignment(frame_idx)

    if show_pointcloud:
        # The point cloud is normalized with the same center/scale.
        self._visualize_pointcloud(frame_idx, scene_center=self.scene_center,
                                   scene_scale=self.scene_scale)

    # Generated superquadrics.
    if show_generated and predictions is not None:
        self._visualize_superquadrics(
            predictions,
            color=generated_color,
            opacity=generated_opacity,
            mesh_resolution=mesh_resolution,
            is_gt=False
        )

    # Ground-truth superquadrics.
    if show_gt and targets is not None:
        self._visualize_superquadrics(
            targets,
            color=gt_color,
            opacity=gt_opacity,
            mesh_resolution=mesh_resolution,
            is_gt=True
        )

    # World coordinate axes.
    if show_coordinate:
        self.coordinate_frame_handle = self.server.scene.add_frame(
            "/coordinate",
            wxyz=(1, 0, 0, 0),
            position=(0, 0, 0),
            axes_length=1.0,
            axes_radius=0.01
        )

    # Camera frusta / RGB overlay.
    self._visualize_cameras(frame_idx)

@staticmethod
def _frame_dict(npz: Optional[Dict], frame_idx: int) -> Optional[Dict]:
    """Return frames[frame_idx] from a loaded npz dict as a plain dict.

    Returns None when the npz is missing, has no 'frames', or the index is
    out of range. Object-dtype entries are unwrapped via ``.item()``.
    """
    if npz is None or 'frames' not in npz:
        return None
    frames = npz['frames']
    if frame_idx >= len(frames):
        return None
    frame_data = frames[frame_idx]
    if isinstance(frame_data, np.ndarray):
        frame_data = frame_data.item()
    return frame_data

def _extract_predictions(self, frame_idx: int) -> Optional[np.ndarray]:
    """Predicted superquadrics for one frame as an (N, 15) float32 array.

    Layout per row: [exists, eps1, eps2, a, b, c, x, y, z,
    rot_z, rot_y, rot_x, vx, vy, vz].
    """
    frame_data = self._frame_dict(self.predictions_npz, frame_idx)
    if frame_data is None or 'superquadrics' not in frame_data:
        return None

    objects_array = []
    for sq in frame_data['superquadrics']:
        obj_params = np.zeros(15, dtype=np.float32)
        obj_params[0] = 1.0 if sq['exists'] else 0.0
        obj_params[1:3] = sq['shape']        # epsilon1, epsilon2
        obj_params[3:6] = sq['scale']        # a, b, c
        obj_params[6:9] = sq['translation']  # x, y, z
        obj_params[9:12] = sq['rotation']    # euler angles
        obj_params[12:15] = sq['velocity']   # vx, vy, vz
        objects_array.append(obj_params)

    return np.array(objects_array, dtype=np.float32)

def _extract_targets(self, frame_idx: int) -> Optional[np.ndarray]:
    """GT superquadrics for one frame as an (N, 16) float32 array.

    Same layout as predictions, with the GT-only inlier_ratio inserted at
    index 12 and velocity shifted to 13:16.
    """
    frame_data = self._frame_dict(self.targets_npz, frame_idx)
    if frame_data is None or 'superquadrics' not in frame_data:
        return None

    objects_array = []
    for sq in frame_data['superquadrics']:
        obj_params = np.zeros(16, dtype=np.float32)
        obj_params[0] = 1.0 if sq['exists'] else 0.0
        obj_params[1:3] = sq['shape']          # epsilon1, epsilon2
        obj_params[3:6] = sq['scale']          # a, b, c
        obj_params[6:9] = sq['translation']    # x, y, z
        obj_params[9:12] = sq['rotation']      # euler angles
        obj_params[12] = sq['inlier_ratio']    # inlier ratio (GT specific)
        obj_params[13:16] = sq['velocity']     # vx, vy, vz
        objects_array.append(obj_params)

    return np.array(objects_array, dtype=np.float32)

def _visualize_superquadrics(self, objects: np.ndarray, color: Tuple,
                             opacity: float, mesh_resolution: int, is_gt: bool):
    """Draw every active superquadric in ``objects`` via the mesh pool."""
    prefix = "gt" if is_gt else "gen"
    num_active = 0

    for obj_idx, obj_params in enumerate(objects):
        # Column 0 is the existence flag.
        if obj_params[0] > 0.5:
            num_active += 1

            try:
                vertices, faces = self.generate_superquadric_mesh(
                    obj_params, num_samples=mesh_resolution
                )

                # Reuse pooled handles keyed by source + slot index.
                mesh_key = f"{prefix}_{obj_idx}"
                mesh = self.get_or_create_mesh(
                    mesh_key, vertices, faces, color, opacity
                )

                if is_gt:
                    self.gt_superquadric_handles.append(mesh)
                else:
                    self.superquadric_handles.append(mesh)

            except Exception as e:
                print(f"❌ 可视化对象{obj_idx}失败: {e}")

    label = "GT" if is_gt else "生成"
    print(f" {label}对象数: {num_active}")

def _visualize_pointcloud(self, frame_idx: int, scene_center: Optional[np.ndarray] = None,
                          scene_scale: Optional[float] = None):
    """Back-project the frame's depth map into a colored point cloud."""
    if self.current_sample_path is None:
        return

    original_data_dir = self.current_sample_path / "original_data"
    if not original_data_dir.exists():
        print("⚠️ 未找到original_data目录")
        return

    # Locate this frame's depth and RGB inputs.
    depth_file = self._find_depth_file(original_data_dir, frame_idx)
    rgb_file = original_data_dir / "rgb" / f"frame_{frame_idx:03d}.png"

    if depth_file is None or not rgb_file.exists():
        print(f"⚠️ 未找到帧{frame_idx}的深度图或RGB")
        return

    try:
        depth = self._load_depth(depth_file, frame_idx)
        if depth.ndim == 2:
            depth = depth[:, :, None]
        rgb = self._load_rgb(rgb_file)

        # Camera intrinsics from metadata.json, else a pinhole fallback.
        camera_K = None
        metadata_file = original_data_dir / "metadata.json"
        if metadata_file.exists():
            with open(metadata_file) as f:
                metadata = json.load(f)
            if 'camera' in metadata and 'K' in metadata['camera']:
                camera_K = np.array(metadata['camera']['K'], dtype=np.float32)
        if camera_K is None:
            h, w = depth.shape[:2]
            camera_K = np.array([[w, 0, w/2], [0, h, h/2], [0, 0, 1]], dtype=np.float32)

        # Use the GT world_info as the camera pose (identity fallback).
        world_info = self._get_world_info(frame_idx, source="targets")
        camera_position = np.zeros(3, dtype=np.float32)
        camera_quat_xyzw = np.array([0, 0, 0, 1], dtype=np.float32)
        if world_info is not None and 'camera_position' in world_info:
            camera_position = np.array(world_info['camera_position'], dtype=np.float32)
            if 'camera_quaternion' in world_info:
                # Stored as xyzw; used as-is here.
                camera_quat_xyzw = np.array(world_info['camera_quaternion'], dtype=np.float32)

        # Normalize into [-10, 10] using the provided center/scale so the
        # cloud lines up with training coordinates.
        converter = DepthToPointCloud()
        _, points_norm, _, depth_center, depth_extent = converter.depth_to_normalized_pointcloud_movi(
            depth=depth,
            segmentation=None,
            camera_K=camera_K,
            camera_position=camera_position,
            camera_quaternion=camera_quat_xyzw,
            resolution=depth.shape[0],
            convert_to_zdepth=True,
            scene_center_override=scene_center,
            scene_scale_override=scene_scale
        )

        valid_mask = depth[:, :, 0] > 0
        points = points_norm[valid_mask]
        colors = rgb.reshape(-1, 3)[valid_mask.reshape(-1)]

        # Record center/scale for the camera frusta to reuse.
        if scene_center is not None and scene_scale is not None:
            self.scene_center = np.array(scene_center, dtype=np.float32)
            self.scene_scale = float(scene_scale)
        else:
            self.scene_center = depth_center
            self.scene_scale = 20.0 / max(depth_extent, 1e-6)

        point_size = self.gui_controls['pointcloud_size'].value
        self.point_cloud_handle = self.server.scene.add_point_cloud(
            "/pointcloud",
            points=points,
            colors=colors,
            point_size=point_size
        )

        print(f" 点云: {len(points)} 个点")

    except Exception as e:
        print(f"❌ 加载点云失败: {e}")

def _find_depth_file(self, original_data_dir: Path, frame_idx: int) -> Optional[Path]:
    """Locate the depth source: merged depth_merge.npz or per-frame .npy."""
    depth_dir = original_data_dir / "depth"
    if not depth_dir.exists():
        return None

    # Merged archive takes precedence.
    merged_npz = depth_dir / "depth_merge.npz"
    if merged_npz.exists():
        return merged_npz

    npy_file = depth_dir / f"frame_{frame_idx:03d}.npy"
    if npy_file.exists():
        return npy_file

    return None

def _load_depth(self, depth_file: Path, frame_idx: int) -> np.ndarray:
    """Load one frame's depth map from a merged .npz or a standalone .npy."""
    if depth_file.suffix == '.npz':
        data = np.load(depth_file)
        frame_key = f"frame_{frame_idx:03d}"
        return data[frame_key]
    else:
        return np.load(depth_file)

def _load_rgb(self, rgb_path: Path) -> np.ndarray:
    """Load an RGB image (cv2 reads BGR; converted to RGB before returning)."""
    img = cv2.imread(str(rgb_path))
    if img is None:
        raise FileNotFoundError(f"Failed to load RGB image: {rgb_path}")
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def _get_world_info(self, frame_idx: int, source: str = "targets") -> Optional[Dict[str, np.ndarray]]:
    """Fetch the world/camera info dict for a frame from pred or target data.

    Supports the new per-frame 'world_info' dicts and the legacy flat 'world'
    tensor ([pos(3), quat(4), scale(1), center(3)] per frame).
    """
    data = self.targets_npz if source == "targets" else self.predictions_npz
    if data is None:
        return None

    if 'frames' in data:
        frames = data['frames']
        if frame_idx < len(frames):
            entry = frames[frame_idx]
            if hasattr(entry, 'item'):
                try:
                    entry = entry.item()
                except Exception:
                    pass
            if isinstance(entry, dict) and 'world_info' in entry:
                return entry['world_info']

    # Legacy 'world' tensor fallback.
    if 'world' in data:
        world = data['world']
        if hasattr(world, 'shape') and world.shape[0] > frame_idx and world.shape[-1] >= 7:
            wp = world[frame_idx]
            scene_center = world[frame_idx, 8:11] if world.shape[-1] >= 11 else np.zeros(3, dtype=np.float32)
            return {
                'camera_position': wp[:3],
                'camera_quaternion': wp[3:7],
                'scene_scale': float(wp[7]) if len(wp) > 7 else 1.0,
                'scene_center': scene_center,
            }
    return None

def _visualize_cameras(self, frame_idx: int):
    """Draw pred/GT camera frusta and, optionally, the frame's RGB image."""
    for h in self.camera_frustum_handles:
        h.remove()
    self.camera_frustum_handles = []
    if self.camera_rgb_handle is not None:
        self.camera_rgb_handle.remove()
        self.camera_rgb_handle = None

    show_target = self.gui_controls.get('show_target_frustum', None)
    show_pred = self.gui_controls.get('show_pred_frustum', None)
    show_rgb = self.gui_controls.get('show_camera_rgb', None)
    if show_target is None or show_pred is None or show_rgb is None:
        return
    if not (show_target.value or show_pred.value):
        return

    rgb_image = None
    if show_rgb.value and self.current_sample_path is not None:
        original_data_dir = self.current_sample_path / "original_data"
        if original_data_dir.exists():
            rgb_path = original_data_dir / "rgb" / f"frame_{frame_idx:03d}.png"
            if rgb_path.exists():
                try:
                    rgb_image = self._load_rgb(rgb_path)
                except Exception:
                    rgb_image = None

    # FOV estimate: default 60 deg, refined from intrinsics when available.
    fov = np.deg2rad(60.0)
    aspect = 1.0
    # Fix: `w` was read below even when no RGB image was loaded.
    h, w = 0, 0
    if rgb_image is not None:
        h, w = rgb_image.shape[:2]
        aspect = w / max(h, 1)
    metadata_file = (self.current_sample_path / "original_data" / "metadata.json") if self.current_sample_path else None
    fx = None
    if metadata_file and metadata_file.exists():
        try:
            with open(metadata_file) as f:
                metadata = json.load(f)
            if 'camera' in metadata and 'K' in metadata['camera']:
                K = np.array(metadata['camera']['K'], dtype=np.float32)
                fx = K[0, 0]
        except Exception:
            fx = None
    if fx is not None and w > 0:
        fov = 2 * np.arctan(w / (2 * fx))

    def add_frustum(world_info: Dict, name: str, color: Tuple[int, int, int]):
        # Skip silently when a source has no pose for this frame.
        if world_info is None:
            return
        cam_pos = np.array(world_info.get('camera_position', np.zeros(3)), dtype=np.float32)
        cam_quat = np.array(world_info.get('camera_quaternion', [0, 0, 0, 1]), dtype=np.float32)  # xyzw
        if cam_quat.shape[0] == 4:
            wxyz = (float(cam_quat[3]), float(cam_quat[0]), float(cam_quat[1]), float(cam_quat[2]))
        else:
            wxyz = (1.0, 0.0, 0.0, 0.0)

        # Move into visualization coordinates (center-subtract, then scale).
        pos = (cam_pos - self.scene_center) * getattr(self, "scene_scale", 1.0)

        frustum = self.server.scene.add_camera_frustum(
            f"/{name}",
            fov=fov,
            aspect=aspect,
            scale=2.0,
            wxyz=wxyz,
            position=pos,
            image=rgb_image if show_rgb.value else None,
            color=tuple(int(c) for c in color)
        )
        self.camera_frustum_handles.append(frustum)

    if show_pred.value:
        add_frustum(self._get_world_info(frame_idx, source="predictions"), "pred_camera_frustum", (100, 149, 237))
    if show_target.value:
        add_frustum(self._get_world_info(frame_idx, source="targets"), "gt_camera_frustum", (255, 99, 71))

def generate_superquadric_mesh(self, params, num_samples=25):
    """Tessellate one superquadric into (vertices, faces).

    params layout (first 12 used): [exists, eps1, eps2, a, b, c,
    tx, ty, tz, rot_z, rot_y, rot_x]; rotation defaults to zero when absent.
    Returns vertices of shape (num_samples**2, 3) and triangle faces of
    shape (2*(num_samples-1)**2, 3).
    """
    epsilon = [params[1], params[2]]
    scale = [params[3], params[4], params[5]]
    translation = [params[6], params[7], params[8]]
    rotation = [params[9], params[10], params[11]] if len(params) >= 12 else [0, 0, 0]

    # Parameter grid over latitude (eta) and longitude (omega).
    eta = np.linspace(-np.pi/2, np.pi/2, num_samples)
    omega = np.linspace(-np.pi, np.pi, num_samples)

    # Hoisted loop invariants: rotation matrix and translation vector.
    rot_matrix = Rotation.from_euler('ZYX', rotation).as_matrix()
    trans_vec = np.array(translation)

    vertices = []
    for e in eta:
        for w in omega:
            # Signed-power parametric form of the superquadric surface.
            cos_eta = np.sign(np.cos(e)) * np.abs(np.cos(e))**epsilon[0]
            sin_eta = np.sign(np.sin(e)) * np.abs(np.sin(e))**epsilon[0]
            cos_omega = np.sign(np.cos(w)) * np.abs(np.cos(w))**epsilon[1]
            sin_omega = np.sign(np.sin(w)) * np.abs(np.sin(w))**epsilon[1]

            point_local = np.array([
                scale[0] * cos_eta * cos_omega,
                scale[1] * cos_eta * sin_omega,
                scale[2] * sin_eta,
            ])
            vertices.append(rot_matrix @ point_local + trans_vec)

    vertices = np.array(vertices)

    # Two triangles per grid quad (j+1 never wraps: j <= num_samples-2).
    faces = []
    for i in range(num_samples - 1):
        for j in range(num_samples - 1):
            idx1 = i * num_samples + j
            idx2 = i * num_samples + j + 1
            idx3 = (i + 1) * num_samples + j
            idx4 = (i + 1) * num_samples + j + 1

            faces.append([idx1, idx2, idx3])
            faces.append([idx2, idx4, idx3])

    return vertices, np.array(faces)

def _normalize_color(self, color) -> Tuple[int, int, int]:
    """Convert a color in [0, 1] floats or [0, 255] ints to a 0-255 RGB tuple.

    Bug fix: the old code cast to uint8 *before* the <= 1.0 range check, so
    float colors such as (0.5, 0.2, 0.8) truncated to (0, 0, 0) instead of
    scaling up to 0-255.
    """
    arr = np.asarray(color, dtype=np.float64)
    if arr.size and arr.max() <= 1.0:
        arr = arr * 255.0
    return tuple(int(c) for c in np.clip(arr, 0, 255).astype(np.uint8))

def get_or_create_mesh(self, key: str, vertices, faces, color, opacity):
    """Fetch a pooled mesh handle (updating it in place) or create a new one."""
    rgb = self._normalize_color(color)

    if key in self.mesh_handles_pool:
        # Reuse the existing handle; faces are unchanged because the pool is
        # cleared whenever the mesh resolution changes.
        mesh = self.mesh_handles_pool[key]
        mesh.vertices = vertices
        mesh.vertex_colors = None
        mesh.wireframe = False
        mesh.opacity = opacity
        mesh.visible = True
        mesh.color = rgb
    else:
        mesh = self.server.scene.add_mesh_simple(
            name=f"/mesh_{key}",
            vertices=vertices,
            faces=faces,
            color=rgb,
            opacity=opacity,
            wireframe=False,
            flat_shading=False
        )
        self.mesh_handles_pool[key] = mesh

    return mesh

def clear_visualization(self):
    """Hide pooled meshes and remove all other per-frame scene handles."""
    # Pooled meshes are hidden, not removed, so they can be reused.
    for mesh in self.mesh_handles_pool.values():
        mesh.visible = False

    self.superquadric_handles = []
    self.gt_superquadric_handles = []

    if self.point_cloud_handle is not None:
        self.point_cloud_handle.remove()
        self.point_cloud_handle = None

    for handle in self.camera_frustum_handles:
        handle.remove()
    self.camera_frustum_handles = []
    if self.camera_rgb_handle is not None:
        self.camera_rgb_handle.remove()
        self.camera_rgb_handle = None

    if self.coordinate_frame_handle is not None:
        self.coordinate_frame_handle.remove()
        self.coordinate_frame_handle = None

def run(self, auto_open_browser: bool = True):
    """Block serving the viser UI until interrupted (Ctrl+C)."""
    print("\n" + "="*60)
    print("🎨 WaveGen 训练可视化器")
    print("="*60)
    print(f"📁 监控目录: {self.core_space_dir}")
    print(f"🌐 Web界面: http://localhost:{self.port}")
    print("="*60)
    print("\n💡 提示:")
    print(" - 如果页面空白一直加载,请刷新浏览器 (Ctrl+Shift+R)")
    print(" - 建议使用 Chrome 或 Firefox 浏览器")
    print("\n按 Ctrl+C 退出\n")

    if auto_open_browser:
        url = f"http://localhost:{self.port}"
        print(f"🌐 正在打开浏览器: {url}")
        try:
            webbrowser.open(url)
        except Exception as e:
            print(f"⚠️ 无法自动打开浏览器: {e}")
            print(f" 请手动访问: {url}")

    try:
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("\n👋 再见!")
        print("正在关闭服务器...")
        # Best-effort cleanup; narrowed from a bare `except:` which would
        # also have swallowed KeyboardInterrupt/SystemExit.
        try:
            for mesh in self.mesh_handles_pool.values():
                mesh.remove()
        except Exception:
            pass


def main():
    """CLI entry point: parse arguments and run the visualizer."""
    import argparse

    parser = argparse.ArgumentParser(description="WaveGen训练结果可视化工具")
    parser.add_argument(
        '--core-space',
        type=str,
        default='core_space',
        help='core_space目录路径(默认: ./core_space)'
    )
    parser.add_argument(
        '--port',
        type=int,
        default=8080,
        help='Viser服务器端口(默认: 8080,如果被占用会自动尝试下一个端口)'
    )
    parser.add_argument(
        '--no-browser',
        action='store_true',
        help='不自动打开浏览器'
    )

    args = parser.parse_args()

    visualizer = TrainingVisualizer(core_space_dir=args.core_space, port=args.port)
    visualizer.run(auto_open_browser=not args.no_browser)


if __name__ == "__main__":
    main()
-ne 0 ]; then
    echo "❌ 缺少 scipy 包"
    echo "请安装: pip install scipy"
    exit 1
fi

echo "✅ 依赖检查通过"
echo ""

# core_space directory: first CLI argument, default "core_space"
CORE_SPACE="${1:-core_space}"

# Turn a relative path into an absolute one (relative to the script dir)
if [[ ! "$CORE_SPACE" = /* ]]; then
    CORE_SPACE="$SCRIPT_DIR/$CORE_SPACE"
fi

# If the directory is missing, try the known alternate location
if [ ! -d "$CORE_SPACE" ]; then
    echo "⚠️ 未找到: $CORE_SPACE"
    echo "尝试其他位置..."

    # Fallback: WaveGen_Augustus_v1/core_space next to this repo
    ALT_PATH="$SCRIPT_DIR/../WaveGen_Augustus_v1/core_space"
    if [ -d "$ALT_PATH" ]; then
        CORE_SPACE="$ALT_PATH"
        echo "✅ 找到: $CORE_SPACE"
    else
        echo "❌ 未找到 core_space 目录"
        echo ""
        echo "请指定 core_space 目录:"
        echo " ./visualize.sh /path/to/core_space"
        echo ""
        echo "或者确保以下位置之一存在:"
        echo " - ./core_space"
        echo " - ../WaveGen_Augustus_v1/core_space"
        exit 1
    fi
fi

echo "📁 使用 core_space: $CORE_SPACE"
echo ""

# Launch the visualizer
echo "🚀 启动可视化工具..."
echo ""

python utils/visualize_training.py --core-space "$CORE_SPACE"