#!/usr/bin/env python3
"""
Hugging Face Spaces Backend - Qwen 1.5B Instruct
Lightweight, fast, and memory-efficient
"""

import os
import json
import logging
import gradio as gr
from pathlib import Path
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    pipeline
)
import time

# Logging: INFO-level root configuration plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Configuration
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Upper bound on newly generated tokens per request (passed as max_new_tokens).
MAX_TOKENS = 512
# Default sampling settings; also used as the initial values of the UI sliders.
TEMPERATURE = 0.7
TOP_P = 0.9

# Default model selection (fixed choice; no memory probing is performed)
def select_model():
    """Return the default Hugging Face model id.

    Qwen 2.5 1.5B Instruct is used because it is small and fast.

    Returns:
        str: The model repository id.
    """
    return "Qwen/Qwen2.5-1.5B-Instruct"

# The MODEL_NAME environment variable overrides the built-in default model id.
MODEL_NAME = os.getenv("MODEL_NAME", select_model())
logger.info(f"📌 Using model: {MODEL_NAME}")

# Plugin System
# Directory (relative to the working directory) scanned for plugin files.
PLUGIN_DIR = Path("plugins")
# Registry filled by load_plugins(): plugin name (file stem) -> path of the file.
loaded_plugins = {}

def load_plugins():
    """Register the plugin files found in the plugin directory.

    Every ``*.py`` file in ``PLUGIN_DIR`` whose name does not start with
    ``__`` is recorded in ``loaded_plugins`` under its file stem. Only the
    path is stored; the file is not imported or executed here. A missing
    directory is logged as a warning and nothing is registered.
    """
    if PLUGIN_DIR.exists():
        candidates = (
            path for path in PLUGIN_DIR.glob("*.py")
            if not path.name.startswith("__")
        )
        for path in candidates:
            try:
                name = path.stem
                logger.info(f"Loading plugin: {name}")
                loaded_plugins[name] = path
                logger.info(f"✓ Plugin {name} loaded")
            except Exception as exc:
                # Best effort: a failure on one file must not stop the scan.
                logger.error(f"✗ Failed to load plugin {path}: {exc}")
    else:
        logger.warning(f"Plugin directory {PLUGIN_DIR} does not exist")

def call_plugin_hook(hook_name, *args, **kwargs):
    """Visit each registered plugin on behalf of hook ``hook_name``.

    Placeholder: the loop walks ``loaded_plugins`` but does not look up or
    invoke anything yet, so ``hook_name``, ``args`` and ``kwargs`` are
    currently unused and the function always returns ``None``.
    """
    for name in loaded_plugins:
        try:
            # Hook dispatch is not implemented yet.
            continue
        except Exception as exc:
            logger.error(f"Error calling hook in {name}: {exc}")

# Model initialization: log which model is about to be loaded and on which device
logger.info(f"⏳ Loading model {MODEL_NAME} on {DEVICE}...")

def load_model_optimized():
    """Load the tokenizer and causal language model for ``MODEL_NAME``.

    No quantization is applied. On CUDA the weights are loaded as float16
    with ``device_map="auto"``; otherwise as float32 without a device map.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    if DEVICE == "cuda":
        placement, precision = "auto", torch.float16
    else:
        placement, precision = None, torch.float32

    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=placement,
        torch_dtype=precision,
    )

    logger.info(f"✅ {MODEL_NAME} loaded successfully")
    return tok, lm

# Load the model once at import time and wrap it in a text-generation pipeline.
try:
    tokenizer, model = load_model_optimized()

    # On CUDA the model is loaded with device_map="auto", i.e. it is already
    # placed on the device; passing an explicit `device` to the pipeline as
    # well conflicts with that placement. On CPU, pin the pipeline to -1.
    pipeline_kwargs = {} if DEVICE == "cuda" else {"device": -1}

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        **pipeline_kwargs,
    )
    logger.info("✅ Model loaded successfully")
except Exception as e:
    # Without a model the app cannot serve requests: log and abort startup.
    logger.error(f"✗ Failed to load model: {e}")
    raise

def generate_response(prompt: str, system_prompt: str = None) -> dict:
    """
    Generate a chat response with the loaded text-generation pipeline.

    Sampling settings are read from the module-level ``MAX_TOKENS``,
    ``TEMPERATURE`` and ``TOP_P`` at call time. A temperature of 0 (or
    below) selects greedy decoding instead of sampling.

    Args:
        prompt: User input
        system_prompt: Optional system prompt; skipped when empty or None

    Returns:
        dict with the keys "response", "tokens", "time_seconds" and "model".
        If generation fails, the exception is logged and the dict carries
        the error text in "response", zero counts, and "error": True.
    """
    try:
        start_time = time.time()

        # Chat-style message list (system message first, if any)
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})

        generation_kwargs = {
            "max_new_tokens": MAX_TOKENS,
            "return_full_text": False,
        }
        if TEMPERATURE > 0:
            generation_kwargs.update(
                do_sample=True,
                temperature=TEMPERATURE,
                top_p=TOP_P,
            )
        else:
            # Sampling requires a strictly positive temperature; fall back
            # to greedy decoding rather than failing the request.
            generation_kwargs["do_sample"] = False

        # Generate
        outputs = pipe(messages, **generation_kwargs)

        response_text = outputs[0]["generated_text"].strip()
        elapsed = time.time() - start_time

        result = {
            "response": response_text,
            # Count only the tokens of the text itself, without any special
            # tokens the tokenizer might add around it.
            "tokens": len(
                tokenizer.encode(response_text, add_special_tokens=False)
            ),
            "time_seconds": round(elapsed, 2),
            "model": MODEL_NAME,
        }

        logger.info(f"Generated {result['tokens']} tokens in {result['time_seconds']}s")
        return result

    except Exception as e:
        logger.error(f"Error generating response: {e}")
        # Same keys as the success result so callers can index it uniformly.
        return {
            "response": f"Error: {str(e)}",
            "tokens": 0,
            "time_seconds": 0,
            "model": MODEL_NAME,
            "error": True,
        }

# Load plugins: populate loaded_plugins from the plugin directory at import time
load_plugins()

# Gradio Interface
# Layout: system prompt and sampling sliders on top, then the user message,
# a submit button, the generated response and a small statistics panel.
with gr.Blocks(title=f"{MODEL_NAME} AI Chatbot") as demo:
    # Headings show the configured model instead of a hard-coded model name.
    gr.Markdown(f"# 🤖 {MODEL_NAME} Inference Server")
    gr.Markdown("Powered by Hugging Face & Gradio")

    with gr.Row():
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                placeholder="Optional: Definiere die Rolle des Assistenten...",
                lines=3,
            )
        with gr.Column(scale=1):
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=2.0,
                value=TEMPERATURE,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                value=TOP_P,
                step=0.05,
            )

    user_input = gr.Textbox(
        label="Your Message",
        placeholder="Type your message...",
        lines=4,
    )

    submit_btn = gr.Button("🚀 Generate", variant="primary", size="lg")

    output_response = gr.Textbox(
        label="Response",
        interactive=False,
        lines=6,
    )

    # gr.JSON is a display-only component; it is created without the
    # `interactive` keyword, which its constructor does not accept.
    output_stats = gr.JSON(label="Statistics")

    def process_input(prompt, system, temp, top):
        """Run one generation with the slider values and format the outputs.

        The sampling settings are module-level globals read by
        generate_response(), so they are swapped in for the duration of the
        call and always restored afterwards.

        Args:
            prompt: Text from the message box.
            system: Text from the system-prompt box (may be empty).
            temp: Temperature slider value.
            top: Top-p slider value.

        Returns:
            tuple: (response text, statistics dict for the JSON panel).
        """
        global TEMPERATURE, TOP_P
        previous = (TEMPERATURE, TOP_P)
        TEMPERATURE, TOP_P = temp, top
        try:
            result = generate_response(prompt, system)
        finally:
            # Restore the defaults even if generation raises.
            TEMPERATURE, TOP_P = previous

        stats = {
            "Tokens": result["tokens"],
            "Time (s)": result["time_seconds"],
            # Error results may not carry a "model" entry; fall back to the
            # configured model name instead of raising KeyError.
            "Model": result.get("model", MODEL_NAME),
        }

        return result["response"], stats

    submit_btn.click(
        process_input,
        inputs=[user_input, system_prompt, temperature, top_p],
        outputs=[output_response, output_stats],
    )

    # Examples
    gr.Examples(
        examples=[
            ["Was ist Machine Learning?", "Du bist ein hilfsbereiter AI-Assistent.", 0.7, 0.9],
            ["Schreibe einen kurzen Witz", "", 0.9, 0.95],
            ["Erkläre Quantencomputing in einfachen Worten", "Du bist ein Physik-Professor.", 0.7, 0.9],
        ],
        inputs=[user_input, system_prompt, temperature, top_p],
    )

# Script entry point: enable the request queue and start the Gradio server
# on all interfaces (0.0.0.0), port 7860, without a public share link.
if __name__ == "__main__":
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )