"""
LLM Server - Self-hosted AI model using llama.cpp.

Runs a real neural network locally on CPU. The Hugging Face Inference API is
contacted only as a last-resort fallback, when the local model is unavailable
or fails, and only if the HF_TOKEN environment variable is set.
"""
import os
import json
from typing import Optional, Dict, List

try:
    from huggingface_hub import hf_hub_download
except ImportError:
    # Only needed for the first-run download; keep the module importable
    # (and an already-downloaded model usable) without huggingface_hub.
    hf_hub_download = None


class LocalLLM:
    """
    Self-hosted LLM using llama.cpp.

    The model is loaded once in the constructor. If loading fails for any
    reason, ``self.llm`` stays ``None`` and ``generate`` uses the Hugging Face
    Inference API fallback instead.
    """

    # Location and identity of the GGUF model file.
    MODEL_DIR = "/app/models"
    MODEL_FILENAME = "qwen2-0.5b-instruct-q4_k_m.gguf"
    MODEL_REPO_ID = "Qwen/Qwen2-0.5B-Instruct-GGUF"

    # llama.cpp runtime settings.
    N_CTX = 2048
    N_THREADS = 4

    # Hosted models tried, in order, by the fallback path.
    FALLBACK_MODELS = (
        "Qwen/Qwen2-0.5B-Instruct",
        "microsoft/Phi-3-mini-4k-instruct",
        "HuggingFaceH4/zephyr-7b-beta",
    )

    def __init__(self):
        self.llm = None
        self.model_path: Optional[str] = None
        self._load_model()

    def _load_model(self):
        """Load the model locally, downloading it first if it is missing.

        Never raises: on any failure a message is printed and ``self.llm`` is
        left as ``None`` so callers can fall back.
        """
        print("=" * 60)
        print("🧠 Loading AI Model...")
        print("=" * 60)

        # Import separately so that only a missing llama-cpp-python is
        # reported as such; other ImportErrors are real loading failures.
        try:
            from llama_cpp import Llama
        except ImportError:
            print("❌ llama-cpp-python not available")
            print("📦 Falling back to Hugging Face API...")
            self.llm = None
            return

        print("✅ llama-cpp-python available")

        try:
            local_model = f"{self.MODEL_DIR}/{self.MODEL_FILENAME}"
            if os.path.exists(local_model):
                print(f"📥 Loading model from: {local_model}")
                model_path = local_model
                done_message = "✅ Model loaded successfully!"
            else:
                # Download model on first run
                if hf_hub_download is None:
                    print("❌ huggingface_hub not available - cannot download model")
                    self.llm = None
                    return
                print("📥 Downloading Qwen2-0.5B model...")
                model_path = hf_hub_download(
                    repo_id=self.MODEL_REPO_ID,
                    filename=self.MODEL_FILENAME,
                    local_dir=self.MODEL_DIR,
                )
                print(f"✅ Downloaded to: {model_path}")
                done_message = "✅ Model loaded!"

            self.llm = Llama(
                model_path=model_path,
                n_ctx=self.N_CTX,
                n_threads=self.N_THREADS,
                verbose=False,
            )
            self.model_path = model_path
            print(done_message)
        except Exception as e:
            # Best-effort boundary: any download/load failure degrades to the
            # fallback path instead of crashing the server at startup.
            print(f"❌ Model loading failed: {e}")
            self.llm = None

    def generate(self, prompt: str, max_tokens: int = 300, temperature: float = 0.7) -> str:
        """Generate text for ``prompt``.

        Uses the local model when loaded; otherwise, or if local generation
        raises, delegates to ``_fallback_generate`` with the same
        ``max_tokens`` and ``temperature``.

        Returns the generated text, or a string starting with ``"ERROR:"``
        when no backend could produce a response.
        """
        if self.llm is None:
            return self._fallback_generate(prompt, max_tokens, temperature)

        try:
            # Format prompt for Qwen2
            formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

            response = self.llm(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stop=["<|im_end|>", "<|im_start|>"],
                echo=False,
            )

            text = response['choices'][0]['text'].strip()
            print(f"🤖 Generated: {text[:100]}...")
            return text
        except Exception as e:
            print(f"❌ Generation error: {e}")
            return self._fallback_generate(prompt, max_tokens, temperature)

    def _fallback_generate(self, prompt: str, max_tokens: int, temperature: float = 0.7) -> str:
        """Fallback to Hugging Face Inference API if local model fails.

        Tries each entry of ``FALLBACK_MODELS`` in order and returns the first
        non-empty ``generated_text``. Returns an ``"ERROR: ..."`` string when
        HF_TOKEN is unset, ``requests`` is not installed, or every model fails.
        """
        # Check the token before importing requests so the common "nothing
        # configured" case never depends on an optional package.
        token = os.environ.get("HF_TOKEN", "")
        if not token:
            return "ERROR: No AI model available. Set HF_TOKEN or ensure model is downloaded."

        try:
            import requests
        except ImportError:
            print("❌ requests not available - cannot use Hugging Face API")
            return "ERROR: Could not generate response"

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }
        # Identical for every model, so build it once.
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "return_full_text": False
            }
        }

        # Try free models
        for model in self.FALLBACK_MODELS:
            url = f"https://api-inference.huggingface.co/models/{model}"
            try:
                resp = requests.post(url, headers=headers, json=payload, timeout=30)
                if resp.status_code != 200:
                    continue
                result = resp.json()
            except (requests.RequestException, ValueError) as e:
                # Network failure or non-JSON body: move on to the next model.
                print(f"⚠️ Fallback model {model} failed: {e}")
                continue

            if isinstance(result, list) and result and isinstance(result[0], dict):
                text = result[0].get("generated_text", "")
                if text:
                    return text

        return "ERROR: Could not generate response"

    def is_loaded(self) -> bool:
        """Return True when the local llama.cpp model is loaded."""
        return self.llm is not None


# Global instance
_llm_instance = None


def get_llm() -> LocalLLM:
    """Return the process-wide ``LocalLLM``, creating it on first use."""
    global _llm_instance
    if _llm_instance is None:
        _llm_instance = LocalLLM()
    return _llm_instance