import os

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Configuration
REPO_ID = "MaziyarPanahi/Llama-3-8B-Instruct-v0.3-GGUF"
FILENAME = "Llama-3-8B-Instruct-v0.3.Q4_K_M.gguf"
MODEL_PATH = os.path.join(os.path.dirname(__file__), "models", FILENAME)


def get_model() -> Llama:
    """
    Downloads the model if not present, then loads it into memory.

    Returns:
        A ready-to-use ``Llama`` instance configured for CPU inference.
    """
    if not os.path.exists(MODEL_PATH):
        print(f"⬇️ Model not found. Downloading {FILENAME} from Hugging Face...")
        os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            local_dir=os.path.dirname(MODEL_PATH),
            # NOTE(review): deprecated (and ignored) in recent huggingface_hub
            # releases; kept for compatibility with older versions, where
            # False forces a real file copy instead of a symlink into the
            # shared HF cache.
            local_dir_use_symlinks=False,
        )
        print("✅ Download complete.")
    else:
        print(f"✅ Model found at {MODEL_PATH}")

    print("🚀 Loading Llama-3 into memory (CPU Mode)...")
    # Initialize Llama (Free Tier: 2 vCPU, 16GB RAM)
    # n_ctx=2048 (Context window)
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=2,  # Optimizing for HF Spaces Free Tier
        verbose=False,
    )
    return llm


# Global instance for re-use: loaded lazily on the first call to
# generate_response so importing this module stays cheap.
_llm_instance = None


def _build_prompt(prompt: str, system_prompt: str = "") -> str:
    """Wrap the system and user messages in the Llama-3 chat template."""
    return (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def generate_response(prompt: str, system_prompt: str = "") -> str:
    """
    Generate a completion for *prompt*, lazily loading the model on first use.

    Args:
        prompt: The user message.
        system_prompt: Optional system instruction (empty by default).

    Returns:
        The raw assistant text produced by the model.
    """
    global _llm_instance
    if _llm_instance is None:
        _llm_instance = get_model()

    output = _llm_instance(
        _build_prompt(prompt, system_prompt),
        max_tokens=512,
        # Stop at the end-of-turn token so the model does not begin a new turn.
        stop=["<|eot_id|>"],
        echo=False,
    )
    return output['choices'][0]['text']