import torch
import sys
from pathlib import Path

# Add parent directory to path so the local `dpm` package resolves.
sys.path.insert(0, str(Path(__file__).parent))


def check_model_memory():
    """Estimate GPU memory requirements of the VDPM model and print advice.

    Builds the model on CPU, counts its parameters, prints rough weight-memory
    estimates for FP32/FP16/BF16/INT8 plus a coarse activation estimate, and
    recommends a precision mode for an 8 GB GPU (RTX 3070 Ti). If CUDA is
    available, also reports the actual device name and current memory usage.

    Returns:
        None. All results are printed to stdout.
    """
    # Minimal stand-in for the project's config object; only the attribute
    # VDPM reads (model.decoder_depth) is provided.
    class SimpleConfig:
        class ModelConfig:
            decoder_depth = 4
        model = ModelConfig()
    cfg = SimpleConfig()

    # Import after sys.path is set so the local package is found.
    from dpm.model import VDPM

    # Create model on CPU first to count parameters without touching VRAM.
    print("Creating model...")
    model = VDPM(cfg)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"\n{'='*60}")
    print(f"MODEL SIZE ANALYSIS FOR RTX 3070 Ti (8GB)")
    print(f"{'='*60}")
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"\nEstimated model weights memory:")
    # Bytes per parameter: FP32=4, FP16/BF16=2, INT8=1; 1024**3 bytes per GB.
    print(f" - FP32 (float32): {total_params * 4 / 1024**3:.2f} GB")
    print(f" - FP16 (float16): {total_params * 2 / 1024**3:.2f} GB")
    print(f" - BF16 (bfloat16): {total_params * 2 / 1024**3:.2f} GB")
    print(f" - INT8 (quantized): {total_params * 1 / 1024**3:.2f} GB <-- RECOMMENDED for 8GB GPU")

    # Estimate activation memory for typical input
    batch_size = 1
    num_frames = 5  # typical video length
    img_size = 518

    print(f"\nEstimated activation memory (batch={batch_size}, frames={num_frames}, img_size={img_size}):")
    # Input images: [B, S, 3, H, W] in FP32 (4 bytes per element).
    input_mem = batch_size * num_frames * 3 * img_size * img_size * 4 / 1024**3
    print(f" - Input images (FP32): {input_mem:.2f} GB")

    # Rough estimate for activations (can be 2-4x model size during forward pass)
    activation_mem_estimate = total_params * 2 * 3 / 1024**3  # conservative estimate
    print(f" - Activations (estimate): {activation_mem_estimate:.2f} GB")

    # Calculate total for different precision modes
    total_fp16 = (total_params * 2 / 1024**3) + input_mem + activation_mem_estimate
    total_int8 = (total_params * 1 / 1024**3) + input_mem + (activation_mem_estimate * 0.6)  # INT8 reduces activations too

    print(f"\nTotal estimated GPU memory needed:")
    print(f" - With FP16/BF16: {total_fp16:.2f} GB")
    print(f" - With INT8 quantization: {total_int8:.2f} GB <-- FITS IN 8GB!")
    print(f"Your RTX 3070 Ti has: 8 GB VRAM")

    # NOTE: check FP16 first — total_fp16 > total_int8 always holds, so
    # testing INT8 first (as the original did) made the FP16 branch
    # unreachable. Order: FP16 fits -> use FP16; else INT8 fits -> quantize;
    # else warn that even INT8 is tight.
    if total_fp16 <= 8:
        print(f"\n✓ Model should fit with FP16!")
    elif total_int8 <= 8:
        print(f"\n✓ With INT8 quantization, model will fit in GPU memory!")
        print(f" Set USE_QUANTIZATION = True in gradio_demo.py")
    else:
        print(f"\n⚠️ WARNING: Even with INT8 ({total_int8:.2f} GB), memory is tight")
        print(f" Recommendations:")
        print(f" 1. Use INT8 quantization (USE_QUANTIZATION = True)")
        print(f" 2. Reduce number of input frames to {num_frames} or fewer")
        print(f" 3. Clear CUDA cache between batches")

    print(f"{'='*60}\n")

    # Check actual GPU memory if CUDA available
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"Current GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
        print(f"Current GPU memory cached: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


if __name__ == "__main__":
    check_model_memory()