"""Gradio Spaces app: chat with a LoRA-fine-tuned Qwen3-30B "CEO voice" model.

The base model is loaded once at module import (4-bit quantized), the LoRA
adapter is applied on top, and a `gr.ChatInterface` serves the chat loop.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import spaces

# Model configuration
BASE_MODEL = "unsloth/qwen3-30b-a3b"
LORA_ADAPTER_PATH = "AI-Talent-Force/ceo-voice-lora-qwen3-30b"

# Load model and tokenizer at startup (once)
print("=" * 60)
print("🚀 INITIALIZING CEO AI EXECUTIVE")
print("=" * 60)

print("\n[1/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
print("✓ Tokenizer loaded successfully!")

print("\n[2/4] Configuring 4-bit quantization...")
# Use 4-bit quantization to fit in GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
print("✓ Quantization config ready!")

print("\n[3/4] Loading base model (Qwen3-30B)...")
print("⏳ This may take 2-3 minutes - downloading and quantizing 30B parameters...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True,
)
print("✓ Base model loaded successfully!")

print("\n[4/4] Loading LoRA adapter (CEO fine-tuning)...")
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
model.eval()
print("✓ LoRA adapter loaded successfully!")

print("\n" + "=" * 60)
print("🎯 CEO AI EXECUTIVE IS READY!")
print("=" * 60)
print("Model is loaded in memory and ready for fast inference.\n")


@spaces.GPU(duration=60)
def chat_with_ceo(message, history):
    """
    Generate a CEO-style reply to *message*, conditioned on recent *history*.

    Args:
        message: The user's current message (plain string).
        history: Prior turns from ChatInterface. Depending on the Gradio
            version/config this is either a list of (user_msg, bot_msg)
            tuples or a list of {"role": ..., "content": ...} dicts;
            both shapes are accepted here.

    Returns:
        The assistant's reply as a plain string — ChatInterface takes care
        of appending it to the displayed history.
    """
    # Build conversation context (limit history to last 5 items for speed).
    # Slicing with [-5:] is safe for any length, including empty history.
    conversation = []
    for item in history[-5:]:
        if isinstance(item, dict):
            # Messages-format history: already role/content shaped.
            conversation.append({"role": item["role"], "content": item["content"]})
        else:
            # Tuple-format history: one (user, assistant) exchange per item.
            user_msg, bot_msg = item
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": bot_msg})

    # Add current message
    conversation.append({"role": "user", "content": message})

    # Apply chat template
    prompt = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize. BUGFIX: the tokenizer keyword is `truncation`, not `truncate`;
    # the original kwarg was silently ignored, so max_length=2048 never applied
    # and a long history could overflow the model's context window.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Some Qwen tokenizers ship without a pad token; fall back to EOS so
    # generate() does not warn/fail on pad_token_id=None.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Generate response with optimized parameters for speed
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Decode only the newly generated tokens (slice off the prompt length).
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:],
        skip_special_tokens=True,
    )

    # Return just the response string - ChatInterface handles the history
    return response


# Create Gradio ChatInterface
demo = gr.ChatInterface(
    fn=chat_with_ceo,
    title="🎯 CEO AI Executive",
    description="""Chat with an AI trained on your CEO's writing style and thoughts.

✅ **Model Status:** Loaded and ready! The model is kept in memory for fast responses.""",
    examples=[
        "What's your vision for the company?",
        "How do you approach leadership?",
        "What are your thoughts on innovation?",
        "Can you share your perspective on team building?",
        "What drives your business strategy?",
    ],
    chatbot=gr.Chatbot(height=500),
)

if __name__ == "__main__":
    demo.launch()