import os
import gc

import llama_cpp
from llama_cpp import Llama


class MairaBrain:
    def __init__(self, repo_id, filename):
        self.repo_id = repo_id
        self.filename = filename
        self.llm = None

    def load(self):
        """Wakes the core with Turbo settings."""
        if self.llm is None:
            print(f"🚀 TURBO LOADING: {self.filename}")
            model_path = os.path.join("/app", self.filename)
            self.llm = Llama(
                model_path=model_path,
                # 🏎️ SPEED TRICK 1: Smaller context (512) makes the response start almost instantly
                n_ctx=512,
                # 🏎️ SPEED TRICK 2: Match HF's physical CPU cores (usually 4)
                n_threads=4,
                # 🏎️ SPEED TRICK 3: Batch processing size
                n_batch=512,
                # 🏎️ SPEED TRICK 4: Quantize the KV cache (moves ~50% less data through RAM)
                type_k=llama_cpp.GGML_TYPE_Q8_0,
                type_v=llama_cpp.GGML_TYPE_Q8_0,
                # 🏎️ SPEED TRICK 5: Flash Attention (if supported by the specific model)
                flash_attn=True,
                use_mmap=True,
                use_mlock=False,
                verbose=False,
            )

    def unload(self):
        """Clears the tracks for the next runner."""
        if self.llm is not None:
            print(f"🧹 CLEARING CACHE: {self.filename}")
            try:
                self.llm.close()
            except Exception:
                pass
            del self.llm
            self.llm = None
            gc.collect()

    def get_response(self, user_id, user_input):
        self.load()
        # Keep the prompt short: long prompts slow down the time to first token.
        prompt = f"Maira: I am a high-speed AI core.\nUser: {user_input}\nMaira:"
        # Generate tokens
        output = self.llm(
            prompt,
            max_tokens=128,  # Short responses feel faster
            stop=["User:", "\n"],
            temperature=0.7,
            repeat_penalty=1.1,
        )
        return output["choices"][0]["text"].strip()
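

# --- Usage sketch (not part of the original module) ---
# A minimal example of driving MairaBrain end to end, assuming a GGUF file has
# already been downloaded into /app, where load() expects it. The repo_id,
# filename, and user_id values below are hypothetical placeholders.
if __name__ == "__main__":
    brain = MairaBrain(
        repo_id="example-org/example-model-GGUF",  # hypothetical HF repo id
        filename="example-model.Q4_K_M.gguf",      # hypothetical GGUF filename in /app
    )

    # First call lazily loads the model with the Turbo settings above.
    reply = brain.get_response(user_id="demo-user", user_input="Hello, who are you?")
    print(reply)

    # Release the model and reclaim RAM before swapping in another core.
    brain.unload()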