import gradio as gr
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# 1. Define the Available Models Library
# Removed all proprietary/unavailable models. All models here have public weights.
# Maps the label shown in the model dropdown to the Hugging Face repo id that
# is passed to `from_pretrained`. Labels must not carry stray leading/trailing
# whitespace: they are rendered verbatim in the UI and used as lookup keys.
MODELS = {
    # --- 🟢 Free Tier Friendly (Smooth on 16GB CPU) ---
    "🟢 Llama-3.2-1B-Tele-it (1 Billion)": "AliMaatouk/Llama-3.2-1B-Tele-it",
    "🟢 Qwen-2.5-Coder-1.5B (1.5 Billion)": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    "🟢 TSLAM-mini-2B": "NetoAISolutions/TSLAM-Mini-2B",

    # --- 🔴 Mid-Size (Slow on Free Tier, requires patience or basic GPU) ---
    "🔴 Llama-3.2-3B-Tele-it (3 Billion)": "AliMaatouk/Llama-3.2-3B-Tele-it",
    "🔴 TSLAM-4B (4 Billion - Gated Model)": "NetoAISolutions/TSLAM-4B",

    # --- 🔴 Massive Models (Will crash Free Tier - Requires A100/H100 GPU clusters) ---
    "🔴 LTM / AdaptKey-Nemotron-30b (30 Billion)": "AdaptKey/AdaptKey-Nemotron-30b",
    "🔴 A.X K1 by SK Telecom (519 Billion MoE)": "skt/A.X-K1",
    "🔴 Kimi K2.6 by Moonshot AI (1 Trillion MoE)": "moonshotai/Kimi-K2.6",
}

# Process-wide state for the currently loaded model. All three names start as
# None and are populated by load_model(); generate_response() reads them.
model = None
tokenizer = None
current_model_id = None

# 2. Memory Management and Model Loading
def load_model(model_selection, hf_token):
    """Load the model picked in the dropdown, replacing any model already loaded.

    Args:
        model_selection: Display label of the model; expected to be a key of
            ``MODELS``.
        hf_token: Hugging Face access token. ``None`` or a blank string means
            no token is sent.

    Returns:
        A status string (prefixed with ✅ or ❌) for the status textbox.
        Unknown selections and load failures are reported through this string
        rather than raised.
    """
    global current_model_id, tokenizer, model

    target_model_id = MODELS.get(model_selection)
    if target_model_id is None:
        return (
            f"❌ Unknown model selection: {model_selection!r}. "
            "Please choose a model from the dropdown."
        )

    if current_model_id == target_model_id and model is not None:
        return f"✅ {target_model_id} is already active."

    # 🧹 Drop the old model before loading the next one to keep peak memory down.
    # Rebind to None instead of `del`: deleting a global removes the name
    # itself, so a failed load afterwards would leave `model`/`tokenizer`
    # undefined and every later access would raise NameError. Clearing
    # `current_model_id` keeps the "already active" check honest.
    if model is not None or tokenizer is not None:
        model = None
        tokenizer = None
        current_model_id = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Pass the token only if one was provided (required for gated models like TSLAM-4B).
    token_arg = (hf_token or "").strip() or None

    try:
        # Load into locals first so the globals are only updated once BOTH
        # pieces loaded; a half-loaded tokenizer/model pair is never published.
        new_tokenizer = AutoTokenizer.from_pretrained(target_model_id, token=token_arg)

        # bfloat16 stores weights in 16 bits, halving memory versus float32.
        # NOTE(security): trust_remote_code=True executes Python code shipped
        # inside the model repository. Only keep repos you trust in MODELS.
        new_model = AutoModelForCausalLM.from_pretrained(
            target_model_id,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            token=token_arg,
        )
    except Exception as e:
        # Deliberately broad: this is the UI boundary, and any failure
        # (network, auth, out-of-memory, bad repo) must become a status message.
        error_text = str(e)
        if "401 Client Error" in error_text or "gated repo" in error_text.lower():
            return "❌ Access Denied: This is a gated model. Please accept the terms on the model's Hugging Face page and enter your Access Token in the box above."
        return f"❌ Error loading model: Ensure you have enough RAM/vRAM. Error: {error_text}"

    tokenizer = new_tokenizer
    model = new_model
    current_model_id = target_model_id
    return f"✅ Successfully loaded: {target_model_id}. You can now chat!"

# 3. The Chat Logic
def generate_response(message, history):
    """Stream the assistant's reply to ``message`` as a progressively longer string.

    Args:
        message: The latest user message.
        history: Earlier conversation turns supplied by the chat UI. Currently
            unused: every message is answered independently of prior turns.

    Yields:
        The reply accumulated so far after each streamed chunk. If no model is
        loaded, a single warning string is yielded instead. If generation
        fails part-way, a final string with an error notice is yielded.
    """
    # Snapshot the globals once so the model/tokenizer pair used for this
    # request stays consistent even if another model is loaded meanwhile.
    active_model, active_tokenizer = model, tokenizer
    if active_model is None or active_tokenizer is None:
        yield "⚠️ Please select a model from the dropdown and click 'Load Model' before chatting."
        return

    prompt = f"User: {message}\nAssistant:"
    # Keep the input tensors on the same device as the model weights.
    inputs = active_tokenizer(prompt, return_tensors="pt").to(active_model.device)

    streamer = TextIteratorStreamer(active_tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=256,
        # temperature/top_p only take effect in sampling mode; without
        # do_sample=True they are ignored whenever the model defaults to greedy.
        do_sample=True,
        temperature=0.3,
        top_p=0.9,
    )

    failures = []

    def _run_generation():
        """Run generate() and always release the consumer loop on failure."""
        try:
            active_model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            # generate() only signals end-of-stream on success; without this
            # the `for ... in streamer` loop below would block forever.
            streamer.end()

    worker = Thread(target=_run_generation)
    worker.start()

    response = ""
    for new_text in streamer:
        response += new_text
        yield response

    worker.join()

    if failures:
        notice = f"❌ Generation failed: {failures[0]}"
        yield f"{response}\n\n{notice}" if response else notice

# 4. Build the User Interface
soft_theme = gr.themes.Soft()
model_labels = list(MODELS)
example_prompts = [
    "Explain how BGP handles route propagation.",
    "Write an Elasticsearch DSL query to find high latency.",
]

with gr.Blocks(theme=soft_theme) as demo:

    # Page header: title plus the colour legend for the model tiers.
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h2>Telecom AI Model Selector</h2>
            <p>Select a model below. Loading may take a few minutes.</p>
            <p style="color: green; font-weight: bold;">🟢 Green: Runs on Free CPU Tier.</p>
            <p style="color: red; font-weight: bold;">🔴 Red: Requires heavy GPU / Paid Tier.</p>
        </div>
    """)

    # Control row: model picker, optional access token, and the load trigger.
    with gr.Row():
        with gr.Column(scale=3):
            model_dropdown = gr.Dropdown(
                label="1. Choose your LLM",
                choices=model_labels,
                value=model_labels[0],  # preselect the first entry of MODELS
                interactive=True,
            )
        with gr.Column(scale=2):
            hf_token_input = gr.Textbox(
                type="password",
                label="2. Hugging Face Token (Optional)",
                placeholder="Required only for Gated Models...",
            )
        with gr.Column(scale=1):
            # Blank spacer so the button lines up vertically with the inputs.
            gr.Markdown(value="<br>")
            load_btn = gr.Button(value="3. Load Model", variant="primary")

    status_text = gr.Textbox(
        label="System Status",
        value="Waiting for model to load...",
        interactive=False,
    )

    # Clicking the button loads the selected model and shows the outcome
    # in the status box.
    load_btn.click(load_model, [model_dropdown, hf_token_input], status_text)

    # Chat panel backed by the streaming generator.
    gr.ChatInterface(generate_response, examples=example_prompts)

demo.launch()