# NOTE(review): the lines below were viewer chrome captured during extraction,
# not Python source — preserved here as a comment:
#   Spaces: | Runtime error | Runtime error | File size: 2,616 Bytes
#   commit 42bf28c | line gutter 1-81
"""
"""
import torch
import gc
def clear_gpu_cache():
    """Release cached CUDA memory and run Python garbage collection.

    When a GPU is present, empties PyTorch's CUDA caching allocator and
    synchronizes the device; garbage collection runs either way.
    (NOTE(review): the original's indentation was flattened, so whether
    gc.collect() sat inside the CUDA guard is ambiguous — placed outside
    here, matching the docstring's "and collecting garbage".)
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    gc.collect()
    print("β GPU cache cleared.")
def get_max_memory(memory_fraction=0.85, cpu_memory="50GB"):
    """
    Build a per-device memory-limit mapping for model loading.

    When passed as ``max_memory`` together with ``device_map="auto"``,
    this tells the model loader how much memory it CAN use per GPU during
    the INITIAL model loading phase. If a model's layers don't fit on one
    GPU under this limit, the loader automatically splits the model
    across multiple GPUs (with CPU as overflow offload space).

    Args:
        memory_fraction: Fraction of each GPU's total memory to allocate,
            in (0.0, 1.0]. Default 0.85 leaves 15% headroom.
        cpu_memory: Maximum CPU memory to use as offload space
            (string spec such as "50GB", passed through unchanged).

    Returns:
        dict: ``{gpu_index: usable_bytes, ..., "cpu": cpu_memory}``,
        or None if no CUDA device is available.

    Raises:
        ValueError: If ``memory_fraction`` is outside (0.0, 1.0].
    """
    # Fail fast on a nonsensical fraction instead of silently building an
    # over- (or zero-) committed memory map. Checked before the CUDA probe
    # so bad arguments are rejected even on CPU-only machines.
    if not 0.0 < memory_fraction <= 1.0:
        raise ValueError(
            f"memory_fraction must be in (0.0, 1.0], got {memory_fraction}"
        )
    if not torch.cuda.is_available():
        print("β No CUDA GPUs available")
        return None
    max_memory = {}
    total_available = 0
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        total_memory = props.total_memory
        usable_memory = int(total_memory * memory_fraction)
        max_memory[i] = usable_memory
        total_available += usable_memory
        print(f"GPU {i} ({props.name}): "
              f"{usable_memory / 1024**3:.2f}GB / {total_memory / 1024**3:.2f}GB "
              f"({memory_fraction*100:.0f}% limit)")
    # CPU memory for offloading if needed
    max_memory["cpu"] = cpu_memory
    print(f"β Total GPU memory available for models: {total_available / 1024**3:.2f}GB")
    print(f"β CPU offload memory: {cpu_memory}")
    return max_memory
def monitor_and_clear_cache(threshold=0.90):
    """
    Inspect each GPU's allocated memory and empty the CUDA cache on any
    device whose usage exceeds ``threshold`` of its total memory.

    Intended to be called periodically during long-running operations.
    Does nothing when CUDA is unavailable.

    Args:
        threshold: Usage fraction (0.0-1.0) that triggers cache clearing.
    """
    if not torch.cuda.is_available():
        return
    for device in range(torch.cuda.device_count()):
        capacity = torch.cuda.get_device_properties(device).total_memory
        used_fraction = torch.cuda.memory_allocated(device) / capacity
        if used_fraction > threshold:
            print(f"β GPU {device} usage at {used_fraction*100:.1f}%, clearing cache...")
            torch.cuda.empty_cache()
            gc.collect()