Spaces:
Build error
Build error
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import time | |
| import torch | |
| from torch import Tensor | |
| from cosmos_predict1.diffusion.training.callbacks.every_n import EveryN | |
| from cosmos_predict1.utils import log | |
| from cosmos_predict1.utils.distributed import rank0_only | |
| from cosmos_predict1.utils.model import Model | |
| from cosmos_predict1.utils.trainer import Trainer | |
class IterSpeed(EveryN):
    """Log per-step timing during warm-up, then the average time per iteration.

    Each of the first ``hit_thres`` training steps is logged individually
    (iteration, loss, and wall-clock time since the previously logged step).
    Once that warm-up is over, every step is forwarded to
    ``EveryN.on_training_step_end``, and ``every_n_impl`` reports the average
    number of seconds per iteration since its previous invocation.

    Args:
        *args: Forwarded unchanged to ``EveryN.__init__``.
        hit_thres (int): Number of initial training steps that are logged one
            by one before handing over to the ``EveryN`` schedule.
        **kwargs: Forwarded unchanged to ``EveryN.__init__``.
    """

    def __init__(self, *args, hit_thres: int = 5, **kwargs):
        super().__init__(*args, **kwargs)
        # Timestamp of the previous every_n_impl call; None until its first call.
        self.time = None
        # Number of warm-up steps logged so far.
        self.hit_counter = 0
        self.hit_thres = hit_thres
        self.name = self.__class__.__name__
        # Timestamp of the last warm-up log (or of construction).
        self.last_hit_time = time.time()

    def on_training_step_end(
        self,
        model: Model,
        data_batch: dict[str, torch.Tensor],
        output_batch: dict[str, torch.Tensor],
        loss: torch.Tensor,
        iteration: int = 0,
    ) -> None:
        """Log every warm-up step; afterwards defer to the ``EveryN`` schedule.

        Args:
            model: Model being trained; only forwarded to the parent hook.
            data_batch: Input batch; only forwarded to the parent hook.
            output_batch: Model outputs; only forwarded to the parent hook.
            loss: Loss tensor; ``loss.item()`` is logged during warm-up.
            iteration: Current training iteration, used in the log message.
        """
        if self.hit_counter < self.hit_thres:
            log.info(
                f"Iteration {iteration}: "
                f"Hit counter: {self.hit_counter + 1}/{self.hit_thres} | "
                f"Loss: {loss.item():.4f} | "
                f"Time: {time.time() - self.last_hit_time:.2f}s"
            )
            self.hit_counter += 1
            self.last_hit_time = time.time()
            # Original author's note: synchronizing here is useful for
            # large-scale training and avoids an OOM crash in the first
            # iterations. Guarded so that hosts without a usable CUDA runtime
            # (CPU-only debugging) do not crash on this call.
            if torch.cuda.is_available():
                torch.cuda.synchronize()
            return
        super().on_training_step_end(model, data_batch, output_batch, loss, iteration)

    def every_n_impl(
        self,
        trainer: Trainer,
        model: Model,
        data_batch: dict[str, Tensor],
        output_batch: dict[str, Tensor],
        loss: Tensor,
        iteration: int,
    ) -> None:
        """Log the average seconds per iteration since the previous call.

        The first invocation only records a reference timestamp and logs
        nothing, because there is no earlier timestamp to measure against.

        Args:
            trainer: Unused here.
            model: Unused here.
            data_batch: Unused here.
            output_batch: Unused here.
            loss: Loss tensor; ``loss.item()`` is included in the log line.
            iteration: Current training iteration, used in the log message.
        """
        if self.time is None:
            self.time = time.time()
            return
        cur_time = time.time()
        # NOTE(review): every_n and step_size are not set in this class;
        # presumably they come from EveryN.__init__ -- confirm against the base.
        iter_speed = (cur_time - self.time) / self.every_n / self.step_size
        log.info(f"{iteration} : iter_speed {iter_speed:.2f} seconds per iteration | Loss: {loss.item():.4f}")
        self.time = cur_time