import os
import gc

import llama_cpp
from llama_cpp import Llama


class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None

    def load(self):
        """Wakes the core with Turbo settings."""
        if self.llm is None:
            print(f"🚀 TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: Smaller context (512) makes the response start almost instantly
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match HF's physical CPU cores (usually 4)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Batch processing size
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache (moves ~50% less data through RAM)
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the specific model)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False,
            )

    def unload(self):
        """Clears the tracks for the next runner."""
        if self.llm is not None:
            print(f"🧹 CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()

    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts slow down the time to first token.
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens
        output = self.llm(
            prompt,
            max_tokens=128,  # Short responses feel faster
            stop=["User:", "\n"],
            temperature=0.7,
            repeat_penalty=1.1,
        )
        return output["choices"][0]["text"].strip()
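

# --- Usage sketch (not part of the original module) ---
# A minimal example of driving MairaBrain end to end, assuming a GGUF file has
# already been downloaded into /app, where load() expects it. The repo_id,
# filename, and user_id values below are hypothetical placeholders.
if __name__ == "__main__":
    brain = MairaBrain(
        repo_id="example-org/example-model-GGUF",  # hypothetical HF repo id
        filename="example-model.Q4_K_M.gguf",      # hypothetical GGUF filename in /app
    )

    # First call lazily loads the model with the Turbo settings above.
    reply = brain.get_response(user_id="demo-user", user_input="Hello, who are you?")
    print(reply)

    # Release the model and reclaim RAM before swapping in another core.
    brain.unload()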