import torch
import time
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import sys
import numpy as np

def _time_matmul(a, b, use_cuda=False):
    """Return the wall-clock seconds taken by one ``torch.matmul(a, b)``.

    When ``use_cuda`` is true the CUDA device is synchronized immediately
    before starting and immediately after the multiplication, so the
    measurement covers the kernel execution rather than only its launch.
    """
    if use_cuda:
        torch.cuda.synchronize()
    start = time.perf_counter()
    torch.matmul(a, b)
    if use_cuda:
        torch.cuda.synchronize()
    return time.perf_counter() - start


def run_benchmark(matrix_size: int = 4000,
                  filename: str = "gpu_benchmark.png") -> None:
    """Benchmark a square matrix multiplication on CPU and, if present, GPU.

    Prints system information and timings to stdout, then saves a bar chart
    comparing the two durations to ``filename``.

    Both measurements time only the ``torch.matmul`` call; operand creation
    (and the host-to-device placement on the GPU path) is excluded so that
    the reported speedup compares like with like.

    Args:
        matrix_size: Side length N of the N x N operands.
        filename: Path of the PNG chart to write.

    Raises:
        ValueError: If ``matrix_size`` is smaller than 1.
    """
    # Local import keeps this block self-contained; platform.node() works on
    # Windows too, whereas os.uname() is Unix-only.
    import platform

    if matrix_size < 1:
        raise ValueError(f"matrix_size must be >= 1, got {matrix_size}")
    N = matrix_size

    print("=" * 60)
    print(f"🚀 SYSTEM BENCHMARK on {platform.node()}")
    print("=" * 60)

    print(f"Python: {sys.version.split()[0]}")
    print(f"PyTorch: {torch.__version__}")

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        device_label = torch.cuda.get_device_name(0)
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"✅ GPU DETECTED: {device_label}")
        print(f"   Memory: {total_gb:.2f} GB")
    else:
        device_label = "CPU Only"
        print("⚠️  NO GPU DETECTED. Running on CPU.")
        print("   (To enable GPU, switch Hardware in HF Space settings)")

    print("-" * 60)

    times = {}

    # CPU TEST
    print(f"1️⃣  CPU Test ({N}x{N} Matrix Mul)...")
    # Warmup, mirroring the GPU path, so one-time setup cost is not timed.
    torch.matmul(torch.randn(100, 100), torch.randn(100, 100))
    # Operands are created before the timer starts (as on the GPU path).
    a_cpu = torch.randn(N, N)
    b_cpu = torch.randn(N, N)
    cpu_time = _time_matmul(a_cpu, b_cpu)
    times['CPU'] = cpu_time
    print(f"   ⏱️  Time: {cpu_time:.4f} seconds")

    # GPU TEST
    if cuda_available:
        print(f"2️⃣  GPU Test ({N}x{N} Matrix Mul)...")
        # Warmup so CUDA context/kernel initialization is not timed.
        torch.matmul(torch.randn(100, 100, device='cuda'),
                     torch.randn(100, 100, device='cuda'))

        a_gpu = torch.randn(N, N, device='cuda')
        b_gpu = torch.randn(N, N, device='cuda')
        gpu_time = _time_matmul(a_gpu, b_gpu, use_cuda=True)
        times['GPU'] = gpu_time
        print(f"   ⏱️  Time: {gpu_time:.4f} seconds")

        # Guard against a zero reading from the timer on tiny workloads.
        if gpu_time > 0:
            speedup = cpu_time / gpu_time
            print(f"   🚀 SPEEDUP: {speedup:.2f}x")
    else:
        print("2️⃣  GPU Test SKIPPED (No CUDA)")
        # Zero-height placeholder so the chart still shows a GPU column.
        times['GPU'] = 0

    # PLOT
    print("-" * 60)
    print("Creating comparison chart...")

    labels = list(times)
    durations = list(times.values())
    colors = ['gray', 'green'] if cuda_available else ['gray', 'red']

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        bars = ax.bar(labels, durations, color=colors)
        ax.set_ylabel('Seconds (Lower is better)')
        ax.set_title(
            f'Benchmark CPU vs GPU ({N}x{N} Matrix Mul)\n{device_label}'
        )

        # Annotate each non-empty bar with its duration.
        for bar in bars:
            height = bar.get_height()
            if height > 0:
                ax.text(bar.get_x() + bar.get_width() / 2, height,
                        f'{height:.4f}s', ha='center', va='bottom')

        fig.savefig(filename)
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
    print(f"💾 Saved to {filename}")

# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_benchmark()