"""
GGUF Model implementation using llama-cpp-python.
Highly optimized for CPU inference.
"""

import os
import asyncio
import traceback
from typing import List, Dict, Any, Optional
from app.models.base_llm import BaseLLM

# llama-cpp-python is treated as an optional dependency: record whether it can
# be imported instead of failing when this module itself is imported.
try:
    from llama_cpp import Llama, LlamaGrammar
    HAS_LLAMA_CPP = True
except ImportError:
    HAS_LLAMA_CPP = False
    # Keep the name defined so truthiness checks on LlamaGrammar stay safe.
    # `Llama` is left undefined here; LlamaCppModel.__init__ raises
    # ImportError when HAS_LLAMA_CPP is False, before `Llama` could be used.
    LlamaGrammar = None


class LlamaCppModel(BaseLLM):
    """
    Wrapper for GGUF models using llama.cpp.
    Provides significant speedups on CPU compared to Transformers.

    The model is loaded lazily by ``initialize()`` and released by
    ``cleanup()``. Generated responses are memoised in a small per-instance
    FIFO cache keyed on the messages, the sampling parameters and the grammar
    text.
    """

    def __init__(
        self,
        name: str,
        model_id: str,
        model_path: Optional[str] = None,
        n_ctx: int = 4096,
        grammar_path: Optional[str] = None,
        n_gpu_layers: int = -1,
    ):
        """Store the configuration; the model itself is not loaded here.

        Args:
            name: Display name, used as a prefix in log output.
            model_id: Identifier forwarded to ``BaseLLM``.
            model_path: Path to the ``.gguf`` file on disk.
            n_ctx: Context window size passed to llama.cpp.
            grammar_path: Optional grammar file name, resolved relative to the
                sibling ``../logic`` directory of this module.
            n_gpu_layers: Value forwarded to llama.cpp as ``n_gpu_layers``.

        Raises:
            ImportError: If llama-cpp-python is not installed.
        """
        super().__init__(name, model_id)
        self.model_path = model_path
        self.n_ctx = n_ctx
        self.grammar_path = grammar_path
        self.n_gpu_layers = n_gpu_layers
        self.default_grammar = None  # Will be loaded from file if provided
        self.llm = None
        self._response_cache = {}
        self._max_cache_size = 100

        if not HAS_LLAMA_CPP:
            raise ImportError("llama-cpp-python is not installed. Cannot use GGUF models.")

    async def initialize(self) -> None:
        """Load the GGUF model and, if configured, the default grammar.

        Does nothing when the model is already initialized. A grammar file
        that is missing or unreadable is reported but does not fail
        initialization.

        Raises:
            FileNotFoundError: If ``model_path`` is unset or does not exist.
            RuntimeError: If llama.cpp fails to load the model.
        """
        if self._initialized:
            return

        if not self.model_path or not os.path.exists(self.model_path):
            # Path resolution is expected to happen before this point; this is
            # only a safety check.
            raise FileNotFoundError(f"GGUF model file not found at: {self.model_path}")

        try:
            print(f"[{self.name}] Loading GGUF model from: {self.model_path}")
            print(f"[{self.name}] File size: {os.path.getsize(self.model_path) / (1024*1024):.2f} MB")
            print(f"[{self.name}] n_ctx={self.n_ctx}, n_threads={os.cpu_count()}, n_gpu_layers={self.n_gpu_layers}")

            # Load model in a thread to avoid blocking the event loop.
            self.llm = await asyncio.to_thread(
                Llama,
                model_path=self.model_path,
                n_ctx=self.n_ctx,
                n_threads=os.cpu_count(),  # Use all available cores
                n_gpu_layers=self.n_gpu_layers,  # GPU layer offloading
                verbose=True,  # Enable verbose to see loading errors
            )
        except Exception as e:
            error_msg = str(e) if str(e) else repr(e)
            print(f"[{self.name}] Failed to load GGUF model: {error_msg}")
            print(f"[{self.name}] Full traceback:")
            traceback.print_exc()
            raise RuntimeError(f"Failed to load GGUF model: {error_msg}") from e

        print(f"[{self.name}] GGUF Model loaded successfully (n_ctx={self.n_ctx}, n_gpu_layers={self.n_gpu_layers})")

        # The grammar is optional, so it is loaded outside the model-loading
        # try block: a grammar problem must not be reported as a model
        # loading failure.
        self._load_default_grammar()

        # Mark as ready only once every initialization step has completed, so
        # the flag never reports a half-initialized object.
        self._initialized = True

    def _load_default_grammar(self) -> None:
        """Read ``grammar_path`` into ``default_grammar`` (best effort)."""
        if not self.grammar_path:
            return

        grammar_full_path = os.path.join(os.path.dirname(__file__), "..", "logic", self.grammar_path)
        try:
            with open(grammar_full_path, 'r', encoding='utf-8') as f:
                self.default_grammar = f.read()
        except FileNotFoundError:
            print(f"[{self.name}] Grammar file not found: {grammar_full_path}")
        except (OSError, UnicodeDecodeError) as e:
            # E.g. the path is a directory, is not readable, or is not UTF-8.
            print(f"[{self.name}] Failed to read grammar file {grammar_full_path}: {e}")
        else:
            print(f"[{self.name}] Loaded grammar from: {grammar_full_path}")

    async def generate(
        self,
        prompt: Optional[str] = None,
        chat_messages: Optional[List[Dict[str, str]]] = None,
        max_new_tokens: int = 150,
        temperature: float = 0.7,
        top_p: float = 0.9,
        grammar: Optional[str] = None,
        **kwargs
    ) -> str:
        """Generate text using llama.cpp

        Args:
            prompt: Simple text prompt (converted to user message)
            chat_messages: List of chat messages with role/content
            max_new_tokens: Maximum tokens to generate
            temperature: Sampling temperature (lower = more deterministic)
            top_p: Nucleus sampling threshold
            grammar: Optional GBNF grammar string to constrain output.
                ``default_grammar`` is NOT applied automatically; pass it
                explicitly if it should be used.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            The stripped text of the first completion choice. Identical
            requests are answered from the response cache.

        Raises:
            RuntimeError: If the model has not been initialized.
            ValueError: If neither ``prompt`` nor ``chat_messages`` is given.
        """
        if not self._initialized or self.llm is None:
            raise RuntimeError(f"[{self.name}] Model not initialized")

        # Ensure we have a list of messages
        messages = chat_messages
        if not messages and prompt:
            messages = [{"role": "user", "content": prompt}]

        if not messages:
            raise ValueError("Either prompt or chat_messages required")

        import json

        # The key includes the grammar text itself (not merely whether one was
        # given) so that requests constrained by different grammars never
        # share a cache entry. An empty grammar is never applied below, so it
        # is keyed the same as "no grammar".
        cache_key = (json.dumps(messages), max_new_tokens, temperature, top_p, grammar or None)
        cached = self._response_cache.get(cache_key)
        if cached is not None:
            return cached

        print(f"DEBUG: Generating with messages: {messages}", flush=True)
        if grammar:
            print("DEBUG: Using GBNF grammar constraint", flush=True)

        # Prepare grammar object if provided. A grammar that fails to parse
        # deliberately falls back to unconstrained generation.
        llama_grammar = None
        if grammar and LlamaGrammar:
            try:
                llama_grammar = LlamaGrammar.from_string(grammar)
            except Exception as e:
                print(f"DEBUG: Failed to parse grammar: {e}", flush=True)
                llama_grammar = None

        # Generate using chat completion to leverage internal templates.
        # NOTE(review): concurrent generate() calls run on separate worker
        # threads against the same Llama instance - confirm that is safe for
        # the installed llama-cpp-python version.
        output = await asyncio.to_thread(
            self.llm.create_chat_completion,
            messages=messages,
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            grammar=llama_grammar,
        )

        print(f"DEBUG: Raw output object: {output}", flush=True)

        # Guard against a None content field (presumably possible in
        # OpenAI-style responses); it is treated as empty text instead of
        # raising AttributeError on .strip().
        content = output['choices'][0]['message']['content']
        response_text = (content or "").strip()
        print(f"DEBUG: Extracted text: {response_text}", flush=True)

        self._store_response(cache_key, response_text)
        return response_text

    def _store_response(self, cache_key, response_text: str) -> None:
        """Insert a response into the cache, evicting oldest entries first."""
        if self._max_cache_size <= 0:
            # Caching disabled; also avoids evicting from an empty dict.
            return

        # Dicts preserve insertion order, so the first key is the oldest.
        while len(self._response_cache) >= self._max_cache_size:
            oldest_key = next(iter(self._response_cache))
            del self._response_cache[oldest_key]
        self._response_cache[cache_key] = response_text

    def get_info(self) -> Dict[str, Any]:
        """Return model information for /models endpoint."""
        return {
            "name": self.name,
            "model_id": self.model_id,
            "type": "gguf",
            "backend": "llama.cpp",
            "context_length": self.n_ctx,
            "loaded": self._initialized,
            "model_path": self.model_path,
            "has_grammar": self.default_grammar is not None,
            "gpu_layers": self.n_gpu_layers,
        }

    async def cleanup(self) -> None:
        """Free memory: drop the model reference and cached responses."""
        # Rebinding to None drops this object's reference to the model.
        self.llm = None
        # Cached responses cannot be served while the model is unloaded
        # (generate() checks initialization first), so release them as well.
        self._response_cache.clear()
        self._initialized = False
        print(f"[{self.name}] GGUF Model unloaded")