VibeVoice7b-low-vram / 4bit /test_accurate_vram.py

Parker

Upload 5 files

2daa24e verified 8 months ago

6.91 kB

	#!/usr/bin/env python
	"""
	Accurate VRAM measurement for VibeVoice models
	Shows the difference between allocated vs reserved memory
	"""

	import os
	import gc
	import torch
	import subprocess
	import time
	from pathlib import Path
	from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
	from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

	def get_gpu_memory_info():
	    """Collect GPU memory statistics from PyTorch and ``nvidia-smi``.

	    All figures are decimal gigabytes (1 GB = 1e9 bytes) so the PyTorch and
	    ``nvidia-smi`` numbers can be compared directly. ``nvidia-smi`` reports
	    MiB, so its values are converted via bytes rather than divided by 1024.

	    Returns:
	        dict: ``{}`` when CUDA is unavailable. Otherwise a dict with keys
	        ``'allocated'`` and ``'reserved'`` (from
	        ``torch.cuda.memory_allocated`` / ``torch.cuda.memory_reserved``)
	        and ``'nvidia_smi'`` / ``'nvidia_total'`` (used / total memory
	        reported by ``nvidia-smi``). The two ``nvidia-smi`` values are ``0``
	        when the tool is missing, times out, exits non-zero, or prints
	        output that cannot be parsed.
	    """
	    if not torch.cuda.is_available():
	        return {}

	    bytes_per_gb = 1e9
	    bytes_per_mib = 2 ** 20

	    # PyTorch memory stats
	    allocated = torch.cuda.memory_allocated() / bytes_per_gb
	    reserved = torch.cuda.memory_reserved() / bytes_per_gb

	    # nvidia-smi is best effort: any failure leaves both values at 0.
	    nvidia_used_gb = 0
	    nvidia_total_gb = 0
	    try:
	        result = subprocess.run(
	            [
	                'nvidia-smi',
	                '--query-gpu=memory.used,memory.total',
	                '--format=csv,nounits,noheader',
	            ],
	            capture_output=True,
	            text=True,
	            timeout=10,
	        )
	        if result.returncode == 0:
	            # One CSV row is printed per GPU; pick the row matching the
	            # current CUDA device and fall back to the first row.
	            # NOTE(review): with CUDA_VISIBLE_DEVICES set, PyTorch's device
	            # index may not match nvidia-smi's ordering - confirm on
	            # multi-GPU hosts.
	            rows = [line for line in result.stdout.splitlines() if line.strip()]
	            device_index = torch.cuda.current_device()
	            row = rows[device_index] if device_index < len(rows) else rows[0]
	            used_mib, total_mib = (int(field) for field in row.split(','))
	            nvidia_used_gb = used_mib * bytes_per_mib / bytes_per_gb
	            nvidia_total_gb = total_mib * bytes_per_mib / bytes_per_gb
	    except (OSError, subprocess.SubprocessError, ValueError, IndexError):
	        # Tool not installed, timed out, or produced unexpected output.
	        nvidia_used_gb = 0
	        nvidia_total_gb = 0

	    return {
	        'allocated': allocated,
	        'reserved': reserved,
	        'nvidia_smi': nvidia_used_gb,
	        'nvidia_total': nvidia_total_gb,
	    }

	def print_memory_report(label, before, after):
	    """Print a before/after comparison of two GPU memory snapshots.

	    Args:
	        label: Heading printed above the report.
	        before: Snapshot dict with numeric ``'allocated'``, ``'reserved'``
	            and ``'nvidia_smi'`` entries (values are printed with a GB unit).
	        after: Snapshot dict with the same keys, taken later.

	    Each delta is printed with an explicit sign, so a decrease shows as
	    ``(-0.50 GB)`` rather than ``(+-0.50 GB)``.
	    """
	    print(f"\n{label}:")
	    rows = (
	        ("PyTorch Allocated", 'allocated'),
	        ("PyTorch Reserved", 'reserved'),
	        ("nvidia-smi Total", 'nvidia_smi'),
	    )
	    for title, key in rows:
	        delta = after[key] - before[key]
	        # The '+' format flag emits the sign for both growth and shrinkage.
	        print(f" {title}: {before[key]:.2f} GB → {after[key]:.2f} GB "
	              f"({delta:+.2f} GB)")
	    # Gap between what PyTorch has reserved and what is actually allocated.
	    overhead = after['reserved'] - after['allocated']
	    print(f" Memory Overhead: {overhead:.2f} GB (PyTorch cache)")

	def clear_gpu_memory():
	    """Run a garbage-collection pass and, if CUDA is present, flush GPU state.

	    On a CUDA machine this empties PyTorch's cache, synchronizes, and resets
	    the peak-memory counters. Without CUDA only the garbage collection runs.
	    """
	    gc.collect()
	    if not torch.cuda.is_available():
	        return

	    cuda = torch.cuda
	    cuda.empty_cache()
	    cuda.synchronize()
	    # Restart peak tracking so later max_memory_* readings start from here.
	    cuda.reset_peak_memory_stats()

	def test_model_memory(model_path, model_name,
	                      voices_dir="/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"):
	    """Load one VibeVoice checkpoint, run a short generation, and report VRAM use.

	    Snapshots are taken before loading, after loading, and before/after a
	    single ``generate`` call; each comparison is printed. The peak counters
	    read at the end cover everything since the ``clear_gpu_memory()`` call
	    at the start of this function, i.e. loading plus generation.

	    Args:
	        model_path: Path passed to both ``from_pretrained`` calls.
	        model_name: Human-readable label used in the printed output and
	            returned under ``'name'``.
	        voices_dir: Directory containing ``en-Alice_woman.wav`` and
	            ``en-Carter_man.wav``, used as the two speaker voice samples.
	            Defaults to the path the script was originally written against.

	    Returns:
	        dict: ``'name'``; ``'allocated'``, ``'reserved'`` and
	        ``'nvidia_smi'`` (growth in GB between the baseline and post-load
	        snapshots); ``'peak_allocated'`` and ``'peak_reserved'`` (GB); and
	        ``'load_time'`` (seconds spent loading processor and model).

	    Raises:
	        RuntimeError: If CUDA is not available, since every measurement
	            below depends on it.
	    """
	    separator = '=' * 70
	    print(f"\n{separator}")
	    print(f"Testing {model_name}")
	    print(f"{separator}")

	    # Fail with a clear message instead of a KeyError on an empty snapshot.
	    if not torch.cuda.is_available():
	        raise RuntimeError("CUDA is not available; cannot measure GPU memory")

	    # Clear memory and get baseline
	    clear_gpu_memory()
	    time.sleep(2)  # Let memory settle

	    baseline = get_gpu_memory_info()
	    print("\nBaseline GPU Memory:")
	    print(f" PyTorch Allocated: {baseline['allocated']:.2f} GB")
	    print(f" PyTorch Reserved: {baseline['reserved']:.2f} GB")
	    print(f" nvidia-smi Shows: {baseline['nvidia_smi']:.2f} GB / {baseline['nvidia_total']:.2f} GB")

	    # Load model
	    print(f"\nLoading {model_name}...")
	    load_start = time.time()

	    processor = VibeVoiceProcessor.from_pretrained(model_path)
	    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
	        model_path,
	        device_map='cuda',
	        torch_dtype=torch.bfloat16,
	    )
	    model.eval()

	    load_time = time.time() - load_start

	    # Get memory after loading
	    loaded = get_gpu_memory_info()
	    print_memory_report("After Model Loading", baseline, loaded)

	    # Test generation to see peak usage
	    print("\nTesting generation...")
	    test_text = "Speaker 1: Testing memory usage. Speaker 2: Let's see the results!"
	    speaker_voices = [
	        os.path.join(voices_dir, "en-Alice_woman.wav"),
	        os.path.join(voices_dir, "en-Carter_man.wav"),
	    ]

	    inputs = processor(
	        text=[test_text],
	        voice_samples=[speaker_voices],
	        padding=True,
	        return_tensors="pt",
	        return_attention_mask=True,
	    )

	    # Snapshots bracket the generate call; the report shows the net change
	    # across it, while the true peak is read from the counters below.
	    pre_gen = get_gpu_memory_info()

	    with torch.no_grad():
	        outputs = model.generate(
	            **inputs,
	            max_new_tokens=None,
	            cfg_scale=1.3,
	            tokenizer=processor.tokenizer,
	            generation_config={'do_sample': False},
	        )

	    post_gen = get_gpu_memory_info()
	    print_memory_report("During Generation", pre_gen, post_gen)

	    # Peak memory stats
	    peak_memory = torch.cuda.max_memory_allocated() / 1e9
	    peak_reserved = torch.cuda.max_memory_reserved() / 1e9
	    print("\nPeak Memory Usage:")
	    print(f" Peak Allocated: {peak_memory:.2f} GB")
	    print(f" Peak Reserved: {peak_reserved:.2f} GB")

	    # Clean up: drop every local that may still reference GPU tensors
	    # before emptying the cache, otherwise that memory cannot be released.
	    del outputs
	    del inputs
	    del model
	    del processor
	    clear_gpu_memory()

	    return {
	        'name': model_name,
	        'allocated': loaded['allocated'] - baseline['allocated'],
	        'reserved': loaded['reserved'] - baseline['reserved'],
	        'nvidia_smi': loaded['nvidia_smi'] - baseline['nvidia_smi'],
	        'peak_allocated': peak_memory,
	        'peak_reserved': peak_reserved,
	        'load_time': load_time,
	    }

	def main():
	    """Benchmark each configured VibeVoice checkpoint and print a summary.

	    Every checkpoint is measured with ``test_model_memory``. A failure for
	    one checkpoint is printed and does not stop the others. A table of the
	    collected results and a short legend are printed at the end.
	    """
	    banner = "=" * 70
	    print(banner)
	    print("ACCURATE VRAM MEASUREMENT FOR VIBEVOICE")
	    print(banner)
	    print("\nNote: PyTorch reserves extra memory for efficiency.")
	    print("nvidia-smi shows total reserved memory, not just allocated.")

	    # (checkpoint path, display name) pairs, measured in this order.
	    checkpoints = (
	        ("/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt", "16-bit Original"),
	        ("/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit", "4-bit Quantized"),
	    )

	    results = []
	    for checkpoint_path, display_name in checkpoints:
	        try:
	            results.append(test_model_memory(checkpoint_path, display_name))
	            time.sleep(5)  # pause before the next checkpoint
	        except Exception as e:
	            print(f"Error testing {display_name}: {e}")

	    # Summary
	    print("\n" + banner)
	    print("MEMORY USAGE SUMMARY")
	    print(banner)
	    numeric_headers = ('Allocated', 'Reserved', 'nvidia-smi', 'Peak')
	    header_cells = [f"{'Model':<20}"] + [f"{title:<12}" for title in numeric_headers]
	    print("\n" + " ".join(header_cells))
	    print("-" * 70)

	    numeric_keys = ('allocated', 'reserved', 'nvidia_smi', 'peak_allocated')
	    for row in results:
	        cells = [f"{row['name']:<20}"]
	        cells.extend(f"{row[key]:<12.2f}" for key in numeric_keys)
	        print(" ".join(cells))

	    legend = (
	        "\n💡 Key Insights:",
	        "- 'Allocated' = Actual model weights in memory",
	        "- 'Reserved' = Total GPU memory reserved by PyTorch (includes cache)",
	        "- 'nvidia-smi' = What nvidia-smi reports (includes all overhead)",
	        "- The difference is PyTorch's memory pool for efficiency",
	    )
	    for line in legend:
	        print(line)


	# Run the benchmark only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()