# Provenance (Hugging Face Space header, commented out so the file parses):
# elungky's picture
# Initial commit for new Space - pre-built Docker image
# 28451f7
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains a basic configuration for video2video experiments.
"""
from hydra.core.config_store import ConfigStore
from cosmos_predict1.autoregressive.configs.base.model_config import create_video2world_model
from cosmos_predict1.autoregressive.configs.base.model_parallel import create_model_parallel_config
from cosmos_predict1.utils import log
from cosmos_predict1.utils.lazy_config import LazyDict
cs = ConfigStore.instance()
"""
Finetune 4B model with TP=1, pytorch backend, low resolution tealrobot data, frames 33, chunk 33.
Usage:
torchrun --nproc_per_node=1 -m cosmos_predict1.autoregressive.train --config=cosmos_predict1/autoregressive/configs/config.py -- experiment=base_4b_example_tealrobotsmall_tp1
"""
# Experiment node: finetune the 4B model on the low-resolution (384x640)
# tealrobot dataset with tensor parallelism disabled (TP=1), pytorch backend,
# 33 video frames tokenized as a single 33-frame chunk.
base_4b_example_tealrobotsmall_tp1: LazyDict = LazyDict(
    {
        # Hydra defaults list: swap in the small tealrobot dataset, local
        # checkpointing, fused AdamW, and a warmup-cosine LR schedule.
        "defaults": [
            {"override /data_train": "tealrobot_video_small"},
            {"override /callbacks": ["basic", "video_teacher_forcing"]},
            {"override /checkpoint": "local"},
            {"override /optimizer": "fused_adamw"},
            {"override /scheduler": "warmup_cosine_lr"},
            "_self_",
        ],
        "job": {
            "project": "posttraining",
            "group": "autoregressive_base",
            "name": "base_4b_example_tealrobotsmall_tp1",
        },
        "model": create_video2world_model(
            model_size="4b",
            model_family="cosmos",
            backend="pytorch",
            tensor_model_parallel_size=1,
            batch_size=1,
            pixel_chunk_duration=33,
            num_video_frames=33,
            video_height=384,
            video_width=640,
            tokenizer_ckpt_path="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/ema.jit",
            add_special_tokens=False,
        ),
        "trainer": {
            "max_iter": 50000,
            "grad_accum_iter": 1,
            "grad_scaler_args": {"enabled": False},
            "run_validation": False,  # No need for validation as epoch <= 1
            "distributed_parallelism": "ddp",
            "callbacks": {
                # Teacher-forcing video sampling callback cadence (iterations).
                "vid_sampling_tf": {"every_n": 500},
            },
        },
        "checkpoint": {
            # Warm-start from the pretrained 4B weights; optimizer/scheduler
            # state is not restored (load_training_state=False).
            "load_path": "checkpoints/Cosmos-Predict1-4B/model.pt",
            "load_training_state": False,
            "strict_resume": True,
            "save_iter": 1000,
        },
        "model_parallel": create_model_parallel_config(),
    }
)
"""
Finetune 4B model with TP=4, pytorch backend, high resolution tealrobot data, frame 33, chunk 33.
Usage:
torchrun --nproc_per_node=4 -m cosmos_predict1.autoregressive.train --config=cosmos_predict1/autoregressive/configs/config.py -- experiment=base_4b_example_tealrobot_tp4
"""
# Experiment node: finetune the 4B model on the high-resolution (640x848)
# tealrobot dataset with 4-way tensor parallelism (TP=4), pytorch backend,
# 33 video frames tokenized as a single 33-frame chunk.
base_4b_example_tealrobot_tp4: LazyDict = LazyDict(
    {
        # Hydra defaults list: swap in the full-resolution tealrobot dataset,
        # local checkpointing, fused AdamW, and a warmup-cosine LR schedule.
        "defaults": [
            {"override /data_train": "tealrobot_video"},
            {"override /callbacks": ["basic", "video_teacher_forcing"]},
            {"override /checkpoint": "local"},
            {"override /optimizer": "fused_adamw"},
            {"override /scheduler": "warmup_cosine_lr"},
            "_self_",
        ],
        "job": {
            "project": "posttraining",
            "group": "autoregressive_base",
            "name": "base_4b_example_tealrobot_tp4",
        },
        "model": create_video2world_model(
            model_size="4b",
            model_family="cosmos",
            backend="pytorch",
            tensor_model_parallel_size=4,
            batch_size=1,
            pixel_chunk_duration=33,
            num_video_frames=33,
            video_height=640,
            video_width=848,
            tokenizer_ckpt_path="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/ema.jit",
            add_special_tokens=False,
        ),
        "trainer": {
            "max_iter": 50000,
            "grad_accum_iter": 1,
            "grad_scaler_args": {"enabled": False},
            "run_validation": False,  # No need for validation as epoch <= 1
            "distributed_parallelism": "ddp",
            "callbacks": {
                # Teacher-forcing video sampling callback cadence (iterations).
                "vid_sampling_tf": {"every_n": 500},
            },
        },
        "checkpoint": {
            # Warm-start from the pretrained 4B weights; strict_resume is
            # relaxed here (unlike the TP=1 config), presumably because the
            # checkpoint is resharded across TP ranks — confirm with loader.
            "load_path": "checkpoints/Cosmos-Predict1-4B/model.pt",
            "load_training_state": False,
            "strict_resume": False,
            "save_iter": 1000,
        },
        "model_parallel": create_model_parallel_config(),
    }
)
def register_experiments(cs):
    """Store every experiment node in the Hydra ``experiment`` group.

    Each node is registered at the global package under the name recorded
    in its own ``job.name`` field, so it can be selected on the command
    line via ``experiment=<name>``.
    """
    experiments = (
        base_4b_example_tealrobotsmall_tp1,
        base_4b_example_tealrobot_tp4,
    )
    for experiment in experiments:
        cs.store(
            group="experiment",
            package="_global_",
            name=experiment["job"]["name"],
            node=experiment,
        )