"""
GPU Memory Manager for InfiniteTalk
Handles memory monitoring, cleanup, and optimization
"""

import gc
import logging

import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class GPUManager:
    """Manages GPU memory usage and optimization"""

    def __init__(self, max_memory_gb=65):
        """
        Initialize GPU Manager

        Args:
            max_memory_gb: Maximum memory threshold in GB (default 65GB,
                leaving headroom on a 70GB ZeroGPU H200 slice)
        """
        self.max_memory_bytes = max_memory_gb * 1024 ** 3
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def get_memory_usage(self):
        """
        Get current GPU memory usage

        Returns:
            dict with allocated, reserved, free, and total memory in GB
        """
        if not torch.cuda.is_available():
            return {"allocated": 0, "reserved": 0, "free": 0, "total": 0}

        allocated = torch.cuda.memory_allocated() / 1024 ** 3
        reserved = torch.cuda.memory_reserved() / 1024 ** 3
        total = torch.cuda.get_device_properties(0).total_memory / 1024 ** 3
        # Measure free space against reserved memory: PyTorch's caching
        # allocator holds reserved blocks even when they are not allocated
        free = total - reserved

        return {
            "allocated": round(allocated, 2),
            "reserved": round(reserved, 2),
            "free": round(free, 2),
            "total": round(total, 2)
        }

    def print_memory_usage(self, prefix=""):
        """Print current memory usage"""
        usage = self.get_memory_usage()
        logger.info(
            f"{prefix}GPU Memory - "
            f"Allocated: {usage['allocated']}GB, "
            f"Reserved: {usage['reserved']}GB, "
            f"Free: {usage['free']}GB"
        )

    def check_memory_threshold(self):
        """
        Check if memory usage exceeds threshold

        Returns:
            bool: True if within safe limits, False if exceeded
        """
        if not torch.cuda.is_available():
            return True

        allocated = torch.cuda.memory_allocated()

        if allocated > self.max_memory_bytes:
            logger.warning(
                f"Memory threshold exceeded! "
                f"Allocated: {allocated / 1024**3:.2f}GB, "
                f"Threshold: {self.max_memory_bytes / 1024**3:.2f}GB"
            )
            return False

        return True

    def cleanup(self):
        """Perform garbage collection and CUDA cache cleanup"""
        gc.collect()
        if torch.cuda.is_available():
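            # empty_cache() returns PyTorch's cached blocks to the driver;
            # synchronize() waits for in-flight kernels so the usage report
            # below reflects a settled state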
            torch.cuda.empty_cache()
            torch.cuda.synchronize()

        logger.info("GPU memory cleaned up")
        self.print_memory_usage("After cleanup - ")

    def optimize_model_for_inference(self, model):
        """
        Apply optimizations to model for inference

        Args:
            model: PyTorch model to optimize

        Returns:
            Optimized model
        """
        model.eval()

        # Enable gradient checkpointing if available (note: this only saves
        # memory when gradients are computed; it is a no-op for inference
        # run under torch.no_grad())
        if hasattr(model, "enable_gradient_checkpointing"):
            model.enable_gradient_checkpointing()

        # Use FP16 for inference to save memory
        if torch.cuda.is_available() and hasattr(model, "half"):
            logger.info("Converting model to FP16")
            model = model.half()

        return model

    def enable_memory_efficient_attention(self):
        """Check whether xformers memory-efficient attention is available"""
        try:
            import xformers  # noqa: F401 - availability check only

            logger.info("xformers available - memory efficient attention enabled")
            return True
        except ImportError:
            logger.warning("xformers not available - using standard attention")
            return False

    def estimate_inference_memory(self, resolution="480p", duration_seconds=10):
        """
        Estimate memory requirements for inference

        Args:
            resolution: Video resolution (480p or 720p)
            duration_seconds: Video duration in seconds

        Returns:
            Estimated memory in GB
        """
        base_memory = 20  # Approximate base model footprint in GB

        if resolution == "720p":
            per_second_memory = 1.5
        else:  # 480p
            per_second_memory = 0.8

        estimated = base_memory + (duration_seconds * per_second_memory)
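        # e.g. a 10s 720p request: 20 + 10 * 1.5 = 35GB estimated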

        logger.info(
            f"Estimated memory for {resolution} video ({duration_seconds}s): "
            f"{estimated:.2f}GB"
        )

        return estimated

    def should_use_chunking(self, video_duration, resolution="480p"):
        """
        Determine if chunked processing should be used

        Args:
            video_duration: Duration in seconds
            resolution: Video resolution

        Returns:
            bool: True if chunking recommended
        """
        estimated_memory = self.estimate_inference_memory(resolution, video_duration)

        # Use chunking if estimated memory exceeds 50GB
        return estimated_memory > 50

    def get_optimal_chunk_size(self, resolution="480p"):
        """
        Get optimal chunk size for video processing

        Args:
            resolution: Video resolution

        Returns:
            Optimal chunk size in seconds
        """
        if resolution == "720p":
            return 10  # 10 second chunks for 720p
        else:
            return 15  # 15 second chunks for 480p

    @staticmethod
    def calculate_duration_for_zerogpu(video_duration, resolution="480p"):
        """
        Calculate ZeroGPU duration parameter

        Args:
            video_duration: Duration of video in seconds
            resolution: Video resolution

        Returns:
            Recommended duration for @spaces.GPU decorator
        """
        base_time = 60  # Base time for model loading

        # Processing time per second of video
        if resolution == "720p":
            processing_rate = 3.5
        else:  # 480p
            processing_rate = 2.5

        # Add safety margin of 1.2x
        estimated_time = base_time + (video_duration * processing_rate)
        duration = int(estimated_time * 1.2)
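        # e.g. a 30s 480p video: int((60 + 30 * 2.5) * 1.2) = 162s requested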

        # Cap at 300 seconds, the practical per-request limit on the free tier
        duration = min(duration, 300)

        logger.info(
            f"Calculated ZeroGPU duration: {duration}s for "
            f"{video_duration}s {resolution} video"
        )

        return duration


# Global instance
gpu_manager = GPUManager()
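

# Minimal usage sketch (an illustrative demo, not part of the pipeline; the
# durations and resolution below are arbitrary example values, and on a
# CPU-only host every call degrades gracefully to zeroed memory stats):
if __name__ == "__main__":
    gpu_manager.print_memory_usage("Startup - ")

    # Plan an 8-second 480p generation
    if gpu_manager.should_use_chunking(video_duration=8, resolution="480p"):
        chunk_size = gpu_manager.get_optimal_chunk_size("480p")
        logger.info(f"Chunking recommended: {chunk_size}s chunks")

    GPUManager.calculate_duration_for_zerogpu(8, "480p")
    gpu_manager.cleanup()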