import os
from typing import Generator

import requests
from llama_cpp import Llama


class ModelManager:
    """Download, cache, and run local GGUF chat models via llama.cpp.

    Models are stored under ``./models``; the "critical" lightweight models
    are fetched eagerly at construction time so first-token latency is low.
    """

    def __init__(self):
        # model_id -> loaded Llama instance (lazy-populated by load_model)
        self.models = {}
        self.model_configs = {
            "fast-chat": {
                "repo": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
                "file": "qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            },
            "tinyllama": {
                "repo": "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
                "file": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
                "format": "tinyllama"
            },
            "coder": {
                "repo": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF",
                "file": "qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "url": "https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct-GGUF/resolve/main/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf",
                "format": "chatml"
            }
        }
        self.models_dir = os.path.join(os.getcwd(), "models")
        os.makedirs(self.models_dir, exist_ok=True)
        self.critical_models = ["fast-chat"]
        self.auto_download_critical()

    def auto_download_critical(self):
        """Download only critical lightweight models at startup."""
        print("Checking for pre-downloaded models...")
        for model_id in self.critical_models:
            try:
                self.download_model(model_id)
                print(f"✓ {model_id} ready")
            except Exception as e:
                # Best-effort at startup: report and continue so one bad
                # download does not prevent the manager from constructing.
                print(f"✗ Failed to ensure {model_id}: {e}")

    def download_model(self, model_id: str):
        """Ensure the GGUF file for *model_id* exists on disk; return its path.

        Raises ValueError for an unconfigured id. Network/HTTP errors are
        re-raised after any partial download is removed.
        """
        config = self.model_configs.get(model_id)
        if not config:
            raise ValueError(f"Model {model_id} not configured")
        target_path = os.path.join(self.models_dir, config["file"])
        # Heuristic completeness check: anything over ~50 MB is assumed to be
        # a finished download of one of these quantized models.
        if os.path.exists(target_path) and os.path.getsize(target_path) > 50000000:
            return target_path
        print(f"Downloading {model_id}...")
        # Stream into a .part temp file and atomically replace, so a crash
        # mid-download can never leave a truncated file at target_path that
        # would pass the size check above on the next run.
        tmp_path = target_path + ".part"
        try:
            with requests.get(config["url"], stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)
            os.replace(tmp_path, target_path)
            print(f"✓ {model_id} downloaded")
            return target_path
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise  # bare raise preserves the original traceback

    def load_model(self, model_id: str):
        """Return a cached Llama instance for *model_id*, loading on first use."""
        if model_id in self.models:
            return self.models[model_id]
        path = self.download_model(model_id)
        self.models[model_id] = Llama(
            model_path=path,
            n_ctx=1024,
            n_threads=2,
            verbose=False
        )
        return self.models[model_id]

    def format_prompt(self, model_id: str, system: str, history: list, prompt: str):
        """Render (system, history, prompt) into the model's chat template.

        Returns a (full_prompt, stop_tokens) tuple. History entries are dicts
        with "role" and "content" keys; any role other than "user" is treated
        as the assistant.
        """
        fmt = self.model_configs[model_id]["format"]
        if fmt == "chatml":
            full = f"<|im_start|>system\n{system}<|im_end|>\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|im_start|>{role}\n{msg['content']}<|im_end|>\n"
            full += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
            # NOTE(review): the original stop lists contained "" — an empty
            # stop string is invalid/matches immediately and would kill
            # generation. "</s>" is the EOS string these chat formats emit;
            # confirm against the model's tokenizer config.
            return full, ["<|im_end|>", "###", "<|im_start|>", "</s>"]
        elif fmt == "tinyllama":
            full = f"<|system|>\n{system}\n"
            for msg in history:
                role = "user" if msg["role"] == "user" else "assistant"
                full += f"<|{role}|>\n{msg['content']}\n"
            full += f"<|user|>\n{prompt}\n<|assistant|>\n"
            return full, ["</s>", "<|user|>", "<|assistant|>"]
        # Unknown format: pass the prompt through raw and stop only on EOS.
        return prompt, ["</s>"]

    def generate_stream(self, model_id: str, prompt: str, context: list = None, **kwargs) -> Generator[str, None, None]:
        """Yield completion text chunks for *prompt* from model *model_id*.

        *context* is an optional chat history (list of role/content dicts).
        Supported kwargs: max_tokens, temperature, top_p.
        """
        llm = self.load_model(model_id)
        system_text = (
            "You are a helpful AI assistant. "
            "For math, use LaTeX with $ $ for display and \\( \\) for inline."
        )
        full_prompt, stop_tokens = self.format_prompt(model_id, system_text, context or [], prompt)
        params = {
            "max_tokens": kwargs.get("max_tokens", 512),
            "stop": stop_tokens,
            "stream": True,
            "temperature": kwargs.get("temperature", 0.7),
            "top_p": kwargs.get("top_p", 0.95)
        }
        for output in llm(full_prompt, **params):
            yield output["choices"][0]["text"]

    def cleanup(self):
        """Release all loaded models and clear the cache."""
        for model in self.models.values():
            # Llama instances may or may not expose close(); guard for both.
            if hasattr(model, 'close'):
                model.close()
        self.models.clear()