"""
LLM Server - Self-hosted AI model using llama.cpp
Runs a real neural network locally - NO API calls, NO hardcoding
"""

import os
import json
from typing import Optional, Dict, List
from huggingface_hub import hf_hub_download

class LocalLLM:
    """
    Self-hosted LLM using llama.cpp.

    Inference runs locally on CPU through llama-cpp-python. When the local
    model cannot be loaded, or a local generation call raises, generation
    falls back to the Hugging Face Inference API, which requires the
    HF_TOKEN environment variable.
    """

    # Defaults reproduce the previously hard-coded deployment layout.
    DEFAULT_MODEL_DIR = "/app/models"
    MODEL_REPO_ID = "Qwen/Qwen2-0.5B-Instruct-GGUF"
    MODEL_FILENAME = "qwen2-0.5b-instruct-q4_k_m.gguf"

    # Hosted models tried, in order, by the remote fallback.
    FALLBACK_MODELS = (
        "Qwen/Qwen2-0.5B-Instruct",
        "microsoft/Phi-3-mini-4k-instruct",
        "HuggingFaceH4/zephyr-7b-beta",
    )

    def __init__(self, model_dir: str = DEFAULT_MODEL_DIR, n_ctx: int = 2048,
                 n_threads: int = 4):
        """Create the wrapper and immediately try to load the model.

        Args:
            model_dir: Directory holding (or receiving) the GGUF model file.
            n_ctx: Context window size passed to llama.cpp.
            n_threads: Number of CPU threads passed to llama.cpp.

        Loading failures are reported on stdout and leave ``self.llm`` as
        ``None``; they never raise out of the constructor.
        """
        self.llm = None
        self.model_path = None
        self.model_dir = model_dir
        self.n_ctx = n_ctx
        self.n_threads = n_threads
        self._load_model()

    def _load_model(self):
        """Load the GGUF model from disk, downloading it first if missing.

        On success ``self.llm`` holds the Llama instance and
        ``self.model_path`` the file it was loaded from. On any failure
        ``self.llm`` is set to ``None`` so that ``generate`` uses the
        remote fallback.
        """
        print("="*60)
        print("🧠 Loading AI Model...")
        print("="*60)

        # llama-cpp-python is optional; without it only the fallback works.
        try:
            from llama_cpp import Llama
        except ImportError:
            print("❌ llama-cpp-python not available")
            print("📦 Falling back to Hugging Face API...")
            self.llm = None
            return
        print("✅ llama-cpp-python available")

        try:
            local_model = os.path.join(self.model_dir, self.MODEL_FILENAME)

            if os.path.exists(local_model):
                print(f"📥 Loading model from: {local_model}")
                model_path = local_model
                loaded_message = "✅ Model loaded successfully!"
            else:
                # Download model on first run
                print("📥 Downloading Qwen2-0.5B model...")
                model_path = hf_hub_download(
                    repo_id=self.MODEL_REPO_ID,
                    filename=self.MODEL_FILENAME,
                    local_dir=self.model_dir,
                )
                print(f"✅ Downloaded to: {model_path}")
                loaded_message = "✅ Model loaded!"

            # Single construction site shared by both branches above.
            self.llm = Llama(
                model_path=model_path,
                n_ctx=self.n_ctx,
                n_threads=self.n_threads,
                verbose=False,
            )
            self.model_path = model_path
            print(loaded_message)
        except Exception as e:
            # Deliberately broad: download and native loading can fail in
            # many ways, and every one of them should degrade to fallback.
            print(f"❌ Model loading failed: {e}")
            self.llm = None

    def generate(self, prompt: str, max_tokens: int = 300, temperature: float = 0.7) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: User message; it is wrapped in the Qwen2 chat template.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature, used by both the local model
                and the remote fallback.

        Returns:
            The stripped completion text from the local model, or whatever
            ``_fallback_generate`` returns when the local model is missing
            or raises (which may be an ``"ERROR: ..."`` string).
        """
        if self.llm is None:
            return self._fallback_generate(prompt, max_tokens, temperature)

        try:
            # Format prompt for Qwen2
            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

            response = self.llm(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stop=["<|im_end|>", "<|im_start|>"],
                echo=False,
            )

            text = response['choices'][0]['text'].strip()
            print(f"🤖 Generated: {text[:100]}...")
            return text

        except Exception as e:
            # Any local failure degrades to the remote fallback.
            print(f"❌ Generation error: {e}")
            return self._fallback_generate(prompt, max_tokens, temperature)

    def _fallback_generate(self, prompt: str, max_tokens: int, temperature: float = 0.7) -> str:
        """Fallback to Hugging Face Inference API if local model fails.

        Args:
            prompt: Raw prompt text sent as ``inputs``.
            max_tokens: Sent as ``max_new_tokens``.
            temperature: Sampling temperature forwarded to the API.

        Returns:
            The first non-empty ``generated_text`` returned by any model in
            ``FALLBACK_MODELS``, otherwise an ``"ERROR: ..."`` string. This
            method reports failures through its return value, not by raising.
        """
        token = os.environ.get("HF_TOKEN", "")
        if not token:
            return "ERROR: No AI model available. Set HF_TOKEN or ensure model is downloaded."

        # Imported only once a request will actually be made, so the
        # missing-token path does not depend on an optional package.
        try:
            import requests
        except ImportError:
            print("❌ requests not available for Hugging Face API fallback")
            return "ERROR: Could not generate response"

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

        parameters = {
            "max_new_tokens": max_tokens,
            "return_full_text": False,
        }
        # NOTE(review): hosted text-generation backends appear to reject
        # non-positive temperatures, so omit the field and let the server
        # default apply in that case - confirm against the API docs.
        if temperature > 0:
            parameters["temperature"] = temperature

        # The payload does not depend on the model, so build it once.
        payload = {"inputs": prompt, "parameters": parameters}

        # Try free models
        for model in self.FALLBACK_MODELS:
            url = f"https://api-inference.huggingface.co/models/{model}"
            try:
                resp = requests.post(url, headers=headers, json=payload, timeout=30)
                if resp.status_code != 200:
                    continue
                result = resp.json()
            except (requests.RequestException, ValueError) as e:
                # Network failure or non-JSON body: report it and move on.
                print(f"⚠️ Fallback model {model} failed: {e}")
                continue

            if isinstance(result, list) and result and isinstance(result[0], dict):
                text = result[0].get("generated_text", "")
                if text:
                    return text

        return "ERROR: Could not generate response"

    def is_loaded(self) -> bool:
        """Return True when a local llama.cpp model is loaded."""
        return self.llm is not None


# Module-wide LocalLLM singleton; stays None until get_llm() is first called.
_llm_instance = None

def get_llm() -> LocalLLM:
    """Return the shared LocalLLM instance, constructing it on first use."""
    global _llm_instance
    if _llm_instance is not None:
        return _llm_instance
    _llm_instance = LocalLLM()
    return _llm_instance