Spaces:
Runtime error
Runtime error
| """ | |
| GPU Optimization Module for DittoTalkingHead | |
| Implements Mixed Precision, CUDA optimizations, and torch.compile | |
| """ | |
| import torch | |
| from torch.cuda.amp import autocast, GradScaler | |
| from typing import Optional, Dict, Any, Callable | |
| import os | |
class GPUOptimizer:
    """GPU optimization settings and helpers (AMP, CUDA flags, torch.compile).

    Constructing an instance on a CUDA device mutates process-wide PyTorch
    state (``torch.backends`` flags, float32 matmul precision, the per-process
    memory fraction and, if unset, ``PYTORCH_CUDA_ALLOC_CONF``).

    Attributes set by ``__init__``:
        device: Resolved ``torch.device`` (falls back to CPU without CUDA).
        use_cuda: True only when ``device`` is a CUDA device.
        use_amp: Default mixed-precision switch for batch processing.
        scaler: ``GradScaler`` on CUDA, otherwise ``None``.
        compile_mode: ``mode`` argument handed to ``torch.compile``.
    """

    def __init__(self, device: str = "cuda"):
        """Resolve the device and apply CUDA tuning when it is a CUDA device.

        Args:
            device: Requested device string (e.g. ``"cuda"`` or ``"cpu"``).
                Ignored in favour of ``"cpu"`` when CUDA is unavailable.
        """
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        # Derive the flag from the *resolved* device so that an explicit
        # device="cpu" on a CUDA-capable host does not enable CUDA-only paths.
        self.use_cuda = self.device.type == "cuda"

        # Mixed precision settings.
        self.use_amp = True
        self.scaler = GradScaler() if self.use_cuda else None

        # torch.compile optimization mode (most aggressive autotuning).
        self.compile_mode = "max-autotune"

        if self.use_cuda:
            self._setup_cuda_optimizations()

    def _setup_cuda_optimizations(self):
        """Apply process-wide CUDA / cuDNN performance settings."""
        # Allocator tuning (limits block splitting to reduce fragmentation).
        # Per the PyTorch CUDA notes the variable is read when the caching
        # allocator starts up, so set it before any call below that may touch
        # the allocator; it has no effect if CUDA memory was already in use.
        # setdefault() keeps a value the user exported themselves.
        os.environ.setdefault('PYTORCH_CUDA_ALLOC_CONF', 'max_split_size_mb:512')

        # cuDNN autotuning: faster kernels at the cost of determinism.
        torch.backends.cudnn.benchmark = True
        torch.backends.cudnn.deterministic = False

        # Enable TensorFloat-32 for matmul and cuDNN convolutions.
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        # Matmul precision setting that permits TF32 tensor cores.
        torch.set_float32_matmul_precision("high")

        # Cap this process at 90% of the memory of the device we actually use.
        if hasattr(torch.cuda, 'set_per_process_memory_fraction'):
            torch.cuda.set_per_process_memory_fraction(0.9, self.device)

        print("✅ CUDA optimizations applied:")
        print(f" - CuDNN benchmark: {torch.backends.cudnn.benchmark}")
        print(f" - TF32 enabled: {torch.backends.cuda.matmul.allow_tf32}")
        print(f" - Matmul precision: {torch.get_float32_matmul_precision()}")

    def optimize_model(self, model: torch.nn.Module, use_compile: bool = True) -> torch.nn.Module:
        """Move ``model`` to the target device and optionally compile it.

        Args:
            model: Model to optimize.
            use_compile: Whether to wrap the model with ``torch.compile``
                (only attempted when the installed torch provides it).

        Returns:
            The model on ``self.device``; compiled when compilation was
            requested and the wrapper could be created.
        """
        model = model.to(self.device)

        if use_compile and hasattr(torch, 'compile'):
            try:
                # torch.compile is lazy: tracing happens on the first call,
                # outside this try block. fullgraph=False lets unsupported
                # Python fall back to eager via graph breaks instead of
                # raising at that later point.
                model = torch.compile(
                    model,
                    mode=self.compile_mode,
                    backend="inductor",
                    fullgraph=False,
                )
                print(f"✅ Model compiled with mode='{self.compile_mode}'")
            except Exception as e:
                # Deliberate best-effort: compilation is an optimization only.
                print(f"⚠️ torch.compile failed: {e}")
                print("Continuing without compilation...")
        return model

    def _to_device(self, tensor: torch.Tensor) -> torch.Tensor:
        """Move one tensor to ``self.device``, pinning host memory for CUDA."""
        if self.use_cuda and tensor.device.type == 'cpu':
            # Pinned host memory allows an asynchronous host-to-device copy.
            return tensor.pin_memory().to(self.device, non_blocking=True)
        return tensor.to(self.device)

    def process_batch_optimized(
        self,
        model: torch.nn.Module,
        audio_batch: torch.Tensor,
        image_batch: torch.Tensor,
        use_amp: Optional[bool] = None
    ) -> torch.Tensor:
        """Run ``model(audio_batch, image_batch)`` on the target device.

        Args:
            model: Callable model taking ``(audio_batch, image_batch)``.
            audio_batch: Audio batch tensor.
            image_batch: Image batch tensor.
            use_amp: Whether to use mixed precision; ``None`` uses
                ``self.use_amp``. Always disabled when not running on CUDA.

        Returns:
            Whatever the model returns.
        """
        if use_amp is None:
            use_amp = self.use_amp
        # CUDA autocast is meaningless off-GPU, even when asked for explicitly.
        use_amp = bool(use_amp) and self.use_cuda

        # Each tensor is handled on its own: one may already be on the GPU
        # while the other is still on the host.
        audio_batch = self._to_device(audio_batch)
        image_batch = self._to_device(image_batch)

        if use_amp:
            with torch.autocast(device_type="cuda"):
                return model(audio_batch, image_batch)
        return model(audio_batch, image_batch)

    def get_memory_stats(self) -> Dict[str, Any]:
        """Return GPU memory statistics in MiB for ``self.device``.

        Returns:
            ``{"cuda_available": False}`` when not on CUDA; otherwise a dict
            with the device name and allocated / reserved / peak-allocated
            memory.
        """
        if not self.use_cuda:
            return {"cuda_available": False}

        mib = 1024 * 1024
        return {
            "cuda_available": True,
            "device": str(self.device),
            "allocated_memory_mb": torch.cuda.memory_allocated(self.device) / mib,
            "reserved_memory_mb": torch.cuda.memory_reserved(self.device) / mib,
            "max_memory_mb": torch.cuda.max_memory_allocated(self.device) / mib,
        }

    def clear_cache(self):
        """Release cached GPU memory and wait for pending work (CUDA only)."""
        if self.use_cuda:
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

    def create_cuda_stream(self) -> Optional[torch.cuda.Stream]:
        """Create a CUDA stream on ``self.device`` for concurrent work.

        Returns:
            A new ``torch.cuda.Stream``, or ``None`` when not on CUDA.
        """
        if self.use_cuda:
            return torch.cuda.Stream(device=self.device)
        return None

    def get_optimization_summary(self) -> str:
        """Return a human-readable summary of the active settings.

        Returns:
            A fixed notice when running on CPU; otherwise a multi-line report
            of device, AMP, compile mode, CUDA flags and memory usage.
        """
        if not self.use_cuda:
            return "GPU not available. Running on CPU."

        mem_stats = self.get_memory_stats()
        lines = [
            "",
            "=== GPU最適化設定 ===",
            f"デバイス: {self.device}",
            f"Mixed Precision (AMP): {'有効' if self.use_amp else '無効'}",
            f"torch.compile mode: {self.compile_mode}",
            "CUDA設定:",
            f"- CuDNN Benchmark: {torch.backends.cudnn.benchmark}",
            f"- TensorFloat-32: {torch.backends.cuda.matmul.allow_tf32}",
            f"- Matmul Precision: {torch.get_float32_matmul_precision()}",
            "メモリ使用状況:",
            f"- 割り当て済み: {mem_stats['allocated_memory_mb']:.1f} MB",
            f"- 予約済み: {mem_stats['reserved_memory_mb']:.1f} MB",
            f"- 最大使用量: {mem_stats['max_memory_mb']:.1f} MB",
            "",
        ]
        return "\n".join(lines)
class OptimizedInference:
    """Inference pipeline that delegates device handling to a GPU optimizer."""

    def __init__(self, gpu_optimizer: Optional["GPUOptimizer"] = None):
        """Store the optimizer used for inference.

        Args:
            gpu_optimizer: Optimizer to use; a new ``GPUOptimizer`` is created
                when ``None`` is given.
        """
        if gpu_optimizer is None:
            gpu_optimizer = GPUOptimizer()
        self.gpu_optimizer = gpu_optimizer

    def run_inference(
        self,
        model: torch.nn.Module,
        audio: torch.Tensor,
        image: torch.Tensor,
        **kwargs
    ) -> torch.Tensor:
        """Run a forward pass in eval mode with autograd disabled.

        Args:
            model: Model to run; it is switched to evaluation mode.
            audio: Audio input tensor.
            image: Image input tensor.
            **kwargs: Optional settings. Only ``use_amp`` is consulted: when
                omitted (or ``None``) the optimizer's own default decides
                whether mixed precision is used. Other keys are accepted for
                compatibility and ignored.

        Returns:
            The model output as produced by
            ``gpu_optimizer.process_batch_optimized``.
        """
        # Evaluation mode: disables dropout / batch-norm updates.
        model.eval()

        # None defers to the optimizer's configuration instead of forcing AMP
        # on, which would also request CUDA autocast on CPU-only hosts.
        use_amp = kwargs.get("use_amp")

        # Inference needs no autograd graph; skipping it saves memory.
        with torch.no_grad():
            return self.gpu_optimizer.process_batch_optimized(
                model, audio, image, use_amp=use_amp
            )