"""Profile model performance and memory usage""" import sys import os sys.path.append(os.path.join(os.path.dirname(__file__), '..')) import torch import time import argparse from torch.profiler import profile, ProfilerActivity, record_function from src.models.ultrathink import UltraThinkModel, UltraThinkConfig from src.models.architecture import ModelConfig def profile_model(model, input_shape=(2, 512), device='cuda', num_iters=10): """Profile model forward and backward pass""" print(f"Profiling model on {device}") print(f"Input shape: {input_shape}") print(f"Number of iterations: {num_iters}") print("=" * 60) model = model.to(device) model.train() vocab_size = model.config.model_config.vocab_size batch_size, seq_length = input_shape # Dummy input input_ids = torch.randint(0, vocab_size, input_shape, device=device) labels = torch.randint(0, vocab_size, input_shape, device=device) print("Warming up...") # Warmup for _ in range(5): with torch.no_grad(): output = model(input_ids=input_ids, labels=labels) if device == 'cuda': torch.cuda.synchronize() print("Profiling...") # Profile with profile( activities=[ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if device == 'cuda' else []), record_shapes=True, profile_memory=True, with_stack=True ) as prof: for i in range(num_iters): with record_function(f"iteration_{i}"): output = model(input_ids=input_ids, labels=labels) loss = output['loss'] loss.backward() if device == 'cuda': torch.cuda.synchronize() # Print results print("\n" + "=" * 60) print("Top 20 Operations by Time") print("=" * 60) sort_key = "cuda_time_total" if device == 'cuda' else "cpu_time_total" print(prof.key_averages().table( sort_by=sort_key, row_limit=20 )) # Memory stats if torch.cuda.is_available() and device == 'cuda': print("\n" + "=" * 60) print("GPU Memory Statistics") print("=" * 60) print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB") print(f"Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB") print(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB") print(f"Max Reserved: {torch.cuda.max_memory_reserved() / 1e9:.2f} GB") # Timing stats print("\n" + "=" * 60) print("Performance Metrics") print("=" * 60) # Measure inference time torch.cuda.synchronize() if device == 'cuda' else None start = time.time() for _ in range(10): with torch.no_grad(): _ = model(input_ids=input_ids) torch.cuda.synchronize() if device == 'cuda' else None end = time.time() avg_time = (end - start) / 10 tokens_per_sec = (batch_size * seq_length) / avg_time print(f"Average forward pass time: {avg_time * 1000:.2f} ms") print(f"Throughput: {tokens_per_sec:.0f} tokens/second") print(f"Throughput: {batch_size / avg_time:.2f} samples/second") return prof def build_test_model(size='tiny'): """Build a test model of specified size""" configs = { 'tiny': { 'n_embd': 256, 'n_layer': 4, 'n_head': 4, 'intermediate_size': 1024 }, 'small': { 'n_embd': 768, 'n_layer': 12, 'n_head': 12, 'intermediate_size': 3072 }, 'medium': { 'n_embd': 1024, 'n_layer': 24, 'n_head': 16, 'intermediate_size': 4096 } } model_params = configs.get(size, configs['tiny']) model_config = ModelConfig( vocab_size=50257, n_positions=512, n_embd=model_params['n_embd'], n_layer=model_params['n_layer'], n_head=model_params['n_head'], n_kv_head=model_params['n_head'] // 2, intermediate_size=model_params['intermediate_size'], flash_attention=False, # For CPU compatibility gradient_checkpointing=False ) config = UltraThinkConfig( model_config=model_config, enable_dre=False, enable_constitutional=False, 
enable_moe=False, enable_multimodal=False, enable_rlhf=False ) return UltraThinkModel(config) def main(): parser = argparse.ArgumentParser(description='Profile ULTRATHINK model') parser.add_argument('--size', type=str, default='tiny', choices=['tiny', 'small', 'medium'], help='Model size') parser.add_argument('--batch_size', type=int, default=2, help='Batch size') parser.add_argument('--seq_length', type=int, default=512, help='Sequence length') parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='Device to use') parser.add_argument('--num_iters', type=int, default=10, help='Number of profiling iterations') parser.add_argument('--export_trace', type=str, default=None, help='Path to export Chrome trace') args = parser.parse_args() print("Building model...") model = build_test_model(args.size) # Count parameters total_params = sum(p.numel() for p in model.parameters()) trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f"Model size: {args.size}") print(f"Total parameters: {total_params:,}") print(f"Trainable parameters: {trainable_params:,}") print(f"Parameter size: {total_params * 4 / 1e9:.2f} GB (float32)") print() # Profile prof = profile_model( model, input_shape=(args.batch_size, args.seq_length), device=args.device, num_iters=args.num_iters ) # Export trace if args.export_trace: prof.export_chrome_trace(args.export_trace) print(f"\nTrace exported to: {args.export_trace}") print("View in Chrome at: chrome://tracing") if __name__ == "__main__": main()
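
# Example invocation (a sketch; the script path is an assumption based on the
# sys.path.append('..') at the top, which implies this file lives one level
# below the repo root, e.g. in a scripts/ directory):
#
#   python scripts/profile_model.py --size tiny --batch_size 2 \
#       --seq_length 512 --device cpu --num_iters 10 --export_trace trace.json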