# VibeVoice7b-low-vram / 4bit / test_accurate_vram.py
# Parker
# Upload 5 files
# 2daa24e verified
#!/usr/bin/env python
"""
Accurate VRAM measurement for VibeVoice models
Shows the difference between allocated vs reserved memory
"""
import os
import gc
import torch
import subprocess
import time
from pathlib import Path
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
def get_gpu_memory_info():
    """Collect current GPU memory figures from PyTorch and nvidia-smi.

    Returns:
        An empty dict when CUDA is unavailable. Otherwise a dict with the
        keys ``allocated`` and ``reserved`` (PyTorch allocator counters)
        and ``nvidia_smi`` and ``nvidia_total`` (used / total memory as
        reported by nvidia-smi). All values are in decimal gigabytes
        (1e9 bytes) so they can be compared directly. The nvidia-smi
        values are 0 when the tool is missing, exits non-zero, or prints
        output that cannot be parsed.
    """
    if not torch.cuda.is_available():
        return {}

    bytes_per_gb = 1e9
    bytes_per_mib = 1024 ** 2

    # PyTorch memory stats
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb

    # nvidia-smi is best-effort: fall back to zeros if it cannot be queried.
    nvidia_used_gb = 0
    nvidia_total_gb = 0
    try:
        result = subprocess.run(
            [
                'nvidia-smi',
                '--query-gpu=memory.used,memory.total',
                '--format=csv,nounits,noheader',
            ],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            # One CSV row is printed per GPU; parsing the whole output as a
            # single row breaks on multi-GPU machines.
            rows = [line for line in result.stdout.splitlines() if line.strip()]
            if rows:
                # NOTE(review): assumes nvidia-smi row order matches PyTorch
                # device indices; CUDA_VISIBLE_DEVICES can remap them.
                device = torch.cuda.current_device()
                row = rows[device] if device < len(rows) else rows[0]
                used_mib, total_mib = (int(field) for field in row.split(','))
                # nvidia-smi reports MiB; convert to the same decimal GB
                # unit used for the PyTorch counters above.
                nvidia_used_gb = used_mib * bytes_per_mib / bytes_per_gb
                nvidia_total_gb = total_mib * bytes_per_mib / bytes_per_gb
    except (OSError, ValueError, subprocess.SubprocessError):
        # Missing binary, unexpected output format, or subprocess failure.
        nvidia_used_gb = 0
        nvidia_total_gb = 0

    return {
        'allocated': allocated,
        'reserved': reserved,
        'nvidia_smi': nvidia_used_gb,
        'nvidia_total': nvidia_total_gb,
    }
def print_memory_report(label, before, after):
    """Print a before/after comparison of two memory snapshots.

    Args:
        label: Heading printed above the report.
        before: Snapshot dict with ``allocated``, ``reserved`` and
            ``nvidia_smi`` values (as returned by ``get_gpu_memory_info``).
        after: Later snapshot with the same keys.

    Each delta is printed with an explicit sign, so a decrease shows as
    ``-0.50`` rather than ``+-0.50``.
    """
    print(f"\n{label}:")
    rows = (
        ("PyTorch Allocated", 'allocated'),
        ("PyTorch Reserved", 'reserved'),
        ("nvidia-smi Total", 'nvidia_smi'),
    )
    for title, key in rows:
        delta = after[key] - before[key]
        # The '+' format flag emits the correct sign for both directions.
        print(f" {title}: {before[key]:.2f} GB → {after[key]:.2f} GB "
              f"({delta:+.2f} GB)")
    # Reserved-but-unallocated memory is what PyTorch holds in its cache.
    print(f" Memory Overhead: {after['reserved'] - after['allocated']:.2f} GB (PyTorch cache)")
def clear_gpu_memory():
    """Run garbage collection and, when CUDA is available, flush PyTorch's cache.

    Also resets PyTorch's peak-memory counters, so subsequent
    ``max_memory_allocated`` / ``max_memory_reserved`` readings start fresh.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # Restart peak tracking from this point on.
    torch.cuda.reset_peak_memory_stats()
def test_model_memory(
    model_path,
    model_name,
    voices_dir="/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices",
):
    """Load a model, run one generation, and report GPU memory at each step.

    Args:
        model_path: Path passed to ``from_pretrained`` for both the
            processor and the model.
        model_name: Human-readable label used in the printed output and in
            the returned summary.
        voices_dir: Directory containing the reference voice samples
            ``en-Alice_woman.wav`` and ``en-Carter_man.wav``. Defaults to
            the location originally hard-coded in this script.

    Returns:
        A dict with ``name``, the load-time deltas ``allocated``,
        ``reserved`` and ``nvidia_smi`` (after loading minus baseline),
        the ``peak_allocated`` / ``peak_reserved`` counters read after
        generation, and ``load_time`` in seconds.
    """
    print(f"\n{'='*70}")
    print(f"Testing {model_name}")
    print(f"{'='*70}")

    # Clear memory and get baseline
    clear_gpu_memory()
    time.sleep(2)  # Let memory settle
    baseline = get_gpu_memory_info()
    print(f"\nBaseline GPU Memory:")
    print(f" PyTorch Allocated: {baseline['allocated']:.2f} GB")
    print(f" PyTorch Reserved: {baseline['reserved']:.2f} GB")
    print(f" nvidia-smi Shows: {baseline['nvidia_smi']:.2f} GB / {baseline['nvidia_total']:.2f} GB")

    # Load model
    print(f"\nLoading {model_name}...")
    load_start = time.time()
    processor = VibeVoiceProcessor.from_pretrained(model_path)
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()
    load_time = time.time() - load_start

    # Get memory after loading
    loaded = get_gpu_memory_info()
    print_memory_report("After Model Loading", baseline, loaded)

    # Test generation to see peak usage
    print(f"\nTesting generation...")
    test_text = "Speaker 1: Testing memory usage. Speaker 2: Let's see the results!"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]
    inputs = processor(
        text=[test_text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    # Snapshot immediately before and after generation
    pre_gen = get_gpu_memory_info()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},
        )
    post_gen = get_gpu_memory_info()
    print_memory_report("During Generation", pre_gen, post_gen)

    # Peak memory stats; defaults keep the return statement valid even if
    # the CUDA branch below is skipped.
    peak_memory = 0.0
    peak_reserved = 0.0
    if torch.cuda.is_available():
        peak_memory = torch.cuda.max_memory_allocated() / 1e9
        peak_reserved = torch.cuda.max_memory_reserved() / 1e9
        print(f"\nPeak Memory Usage:")
        print(f" Peak Allocated: {peak_memory:.2f} GB")
        print(f" Peak Reserved: {peak_reserved:.2f} GB")

    # Clean up: drop every reference that may hold GPU tensors *before*
    # emptying the cache, otherwise those blocks cannot be released.
    del outputs
    del inputs
    del model
    del processor
    clear_gpu_memory()

    return {
        'name': model_name,
        'allocated': loaded['allocated'] - baseline['allocated'],
        'reserved': loaded['reserved'] - baseline['reserved'],
        'nvidia_smi': loaded['nvidia_smi'] - baseline['nvidia_smi'],
        'peak_allocated': peak_memory,
        'peak_reserved': peak_reserved,
        'load_time': load_time,
    }
def main():
    """Measure each configured model in turn and print a comparison table.

    A model whose measurement raises is reported and skipped; the summary
    lists only the models that completed.
    """
    rule = "="*70
    print(rule)
    print("ACCURATE VRAM MEASUREMENT FOR VIBEVOICE")
    print(rule)
    print("\nNote: PyTorch reserves extra memory for efficiency.")
    print("nvidia-smi shows total reserved memory, not just allocated.")

    # (checkpoint path, display name) pairs, measured in this order.
    candidates = (
        ("/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt", "16-bit Original"),
        ("/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit", "4-bit Quantized"),
    )

    summaries = []
    for checkpoint, title in candidates:
        try:
            summaries.append(test_model_memory(checkpoint, title))
            time.sleep(5)
        except Exception as err:
            print(f"Error testing {title}: {err}")

    # Summary
    print("\n" + rule)
    print("MEMORY USAGE SUMMARY")
    print(rule)
    print(f"\n{'Model':<20} {'Allocated':<12} {'Reserved':<12} {'nvidia-smi':<12} {'Peak':<12}")
    print("-"*70)
    numeric_columns = ('allocated', 'reserved', 'nvidia_smi', 'peak_allocated')
    for row in summaries:
        cells = [f"{row['name']:<20}"]
        cells.extend(f"{row[column]:<12.2f}" for column in numeric_columns)
        print(" ".join(cells))

    print("\n💡 Key Insights:")
    for insight in (
        "- 'Allocated' = Actual model weights in memory",
        "- 'Reserved' = Total GPU memory reserved by PyTorch (includes cache)",
        "- 'nvidia-smi' = What nvidia-smi reports (includes all overhead)",
        "- The difference is PyTorch's memory pool for efficiency",
    ):
        print(insight)


# Run the measurement only when executed as a script, not on import.
if __name__ == "__main__":
    main()