"""

"""

import torch
import gc

def clear_gpu_cache():
    """Free GPU memory by collecting garbage and emptying the CUDA cache.

    No-op when CUDA is unavailable. Returns None.
    """
    if torch.cuda.is_available():
        # Collect Python garbage FIRST: tensors with no remaining references
        # return their blocks to the caching allocator, which empty_cache()
        # can then actually release back to the driver. The original order
        # (empty_cache before gc.collect) left dead tensors' memory cached.
        gc.collect()
        torch.cuda.empty_cache()
        # Wait for in-flight kernels so the "cleared" message is accurate.
        torch.cuda.synchronize()
        print("βœ“ GPU cache cleared.")


def get_max_memory(memory_fraction=0.85, cpu_memory="50GB"):
    """
    Automatically configure max memory per GPU.
    
    When used with device_map="auto", this tells the model loader how much memory
    it CAN use per GPU during the INITIAL model loading phase. If a model's layers
    don't fit on one GPU with this limit, the loader will automatically split the
    model across multiple GPUs.
    
    Args:
        memory_fraction: Fraction of GPU memory to allocate (0.0-1.0). 
                        Default 0.85 leaves 15% headroom.
        cpu_memory: Maximum CPU memory to use as offload space.
    
    Returns:
        dict: Memory limits per device (int device index -> bytes, plus a
              "cpu" key), or None if no CUDA available.

    Raises:
        ValueError: If memory_fraction is not in the interval (0, 1].
    """
    # A fraction of 0 or below yields useless 0-byte limits; above 1 would
    # over-commit the device. Fail fast instead of silently misconfiguring.
    if not 0.0 < memory_fraction <= 1.0:
        raise ValueError(
            f"memory_fraction must be in (0, 1], got {memory_fraction}"
        )

    if not torch.cuda.is_available():
        print("⚠ No CUDA GPUs available")
        return None
    
    max_memory = {}
    total_available = 0
    
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total_memory = props.total_memory
        usable_memory = int(total_memory * memory_fraction)
        max_memory[i] = usable_memory
        total_available += usable_memory
        
        print(f"GPU {i} ({props.name}): "
              f"{usable_memory / 1024**3:.2f}GB / {total_memory / 1024**3:.2f}GB "
              f"({memory_fraction*100:.0f}% limit)")
    
    # CPU memory for offloading if needed
    max_memory["cpu"] = cpu_memory
    
    print(f"βœ“ Total GPU memory available for models: {total_available / 1024**3:.2f}GB")
    print(f"βœ“ CPU offload memory: {cpu_memory}")
    
    return max_memory

def monitor_and_clear_cache(threshold=0.90):
    """
    Monitor GPU memory and clear cache if usage exceeds threshold.
    Call this periodically during long-running operations.
    
    Args:
        threshold: Memory usage fraction (0.0-1.0) that triggers cache clearing
    """
    if not torch.cuda.is_available():
        return
    
    over_threshold = False
    for i in range(torch.cuda.device_count()):
        total = torch.cuda.get_device_properties(i).total_memory
        # memory_allocated counts live tensor allocations, so readings for
        # later GPUs are not skewed by clearing the allocator cache.
        usage = torch.cuda.memory_allocated(i) / total
        
        if usage > threshold:
            print(f"⚠ GPU {i} usage at {usage*100:.1f}%, clearing cache...")
            over_threshold = True
    
    # Clear ONCE after the scan (the original cleared per over-threshold GPU),
    # and collect garbage BEFORE emptying so dead tensors' blocks are returned
    # to the cache and can actually be released.
    if over_threshold:
        gc.collect()
        torch.cuda.empty_cache()