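# cortex-ai / llm_server.py
# Z User
# Update Cortex AI: Add transformers, torch deps and improved AI prompts for WAEC/JAMB/NECO
# e88cd1b
"""
LLM Server - Self-hosted AI model using llama.cpp
Runs a real neural network locally - NO hardcoding.
External API calls are made only as a fallback: when the local model cannot be
loaded or local generation fails, the Hugging Face Inference API is used
(requires the HF_TOKEN environment variable).
"""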
import os
import json
from typing import Optional, Dict, List
from huggingface_hub import hf_hub_download
class LocalLLM:
    """
    Self-hosted LLM using llama.cpp (via llama-cpp-python).

    On construction the GGUF model is loaded from ``MODEL_DIR`` (downloading
    it from the Hugging Face Hub first when the file is missing). When
    llama-cpp-python is not installed or loading fails, ``self.llm`` stays
    ``None`` and ``generate`` falls back to the Hugging Face Inference API,
    which requires the ``HF_TOKEN`` environment variable.
    """

    # Hub repository / file name of the quantized model and where it is stored.
    MODEL_REPO = "Qwen/Qwen2-0.5B-Instruct-GGUF"
    MODEL_FILE = "qwen2-0.5b-instruct-q4_k_m.gguf"
    MODEL_DIR = "/app/models"

    # Values passed to ``Llama(...)`` as ``n_ctx`` and ``n_threads``.
    N_CTX = 2048
    N_THREADS = 4

    # Hosted models tried in order by the Inference API fallback.
    FALLBACK_MODELS = (
        "Qwen/Qwen2-0.5B-Instruct",
        "microsoft/Phi-3-mini-4k-instruct",
        "HuggingFaceH4/zephyr-7b-beta",
    )

    def __init__(self):
        """Initialise the attributes and try to load the local model."""
        self.llm = None
        self.model_path: Optional[str] = None
        self._load_model()

    def _create_llm(self, llama_cls, model_path: str) -> None:
        """Instantiate ``llama_cls`` for ``model_path`` and record it on self."""
        self.llm = llama_cls(
            model_path=model_path,
            n_ctx=self.N_CTX,
            n_threads=self.N_THREADS,
            verbose=False,
        )
        # Only recorded once construction succeeded.
        self.model_path = model_path

    def _load_model(self):
        """Load the model locally, downloading it on first run.

        Never raises: on any failure ``self.llm`` is left as ``None`` so that
        ``generate`` uses the remote fallback instead.
        """
        print("=" * 60)
        print("🧠 Loading AI Model...")
        print("=" * 60)

        # Keep this try narrow so that only a genuinely missing
        # llama-cpp-python is reported as such; ImportErrors raised later
        # (e.g. while downloading) are reported as a loading failure.
        try:
            from llama_cpp import Llama
        except ImportError:
            print("❌ llama-cpp-python not available")
            print("📦 Falling back to Hugging Face API...")
            self.llm = None
            return
        print("✅ llama-cpp-python available")

        try:
            local_model = os.path.join(self.MODEL_DIR, self.MODEL_FILE)
            if os.path.exists(local_model):
                print(f"📥 Loading model from: {local_model}")
                self._create_llm(Llama, local_model)
                print("✅ Model loaded successfully!")
            else:
                # Download model on first run
                print("📥 Downloading Qwen2-0.5B model...")
                model_path = hf_hub_download(
                    repo_id=self.MODEL_REPO,
                    filename=self.MODEL_FILE,
                    local_dir=self.MODEL_DIR,
                )
                print(f"✅ Downloaded to: {model_path}")
                self._create_llm(Llama, model_path)
                print("✅ Model loaded!")
        except Exception as e:
            # Best-effort boundary: any load/download problem disables the
            # local model rather than crashing the server.
            print(f"❌ Model loading failed: {e}")
            self.llm = None

    def generate(self, prompt: str, max_tokens: int = 300, temperature: float = 0.7) -> str:
        """Generate text for ``prompt``.

        Uses the local model when it is loaded; otherwise, or when local
        generation raises, delegates to ``_fallback_generate`` with the same
        ``max_tokens`` and ``temperature``.

        Returns:
            The stripped completion text, or an ``"ERROR: ..."`` string
            produced by the fallback when no backend could answer.
        """
        if self.llm is None:
            return self._fallback_generate(prompt, max_tokens, temperature)

        # Format prompt for Qwen2 (ChatML-style turn markers).
        formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        try:
            response = self.llm(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stop=["<|im_end|>", "<|im_start|>"],
                echo=False,
            )
            text = response["choices"][0]["text"].strip()
        except Exception as e:
            print(f"❌ Generation error: {e}")
            return self._fallback_generate(prompt, max_tokens, temperature)

        print(f"🤖 Generated: {text[:100]}...")
        return text

    def _fallback_generate(self, prompt: str, max_tokens: int, temperature: float = 0.7) -> str:
        """Fallback to Hugging Face Inference API if local model fails.

        Args:
            prompt: Raw prompt sent as the ``inputs`` field.
            max_tokens: Sent as ``max_new_tokens``.
            temperature: Sampling temperature forwarded to the API
                (defaults to 0.7, the value previously hard-coded).

        Returns:
            The first non-empty ``generated_text`` from ``FALLBACK_MODELS``
            (tried in order), or an ``"ERROR: ..."`` string when no token is
            configured, ``requests`` is unavailable, or every model fails.
        """
        token = os.environ.get("HF_TOKEN", "")
        if not token:
            return "ERROR: No AI model available. Set HF_TOKEN or ensure model is downloaded."

        # Imported only once a token exists, so a missing ``requests`` package
        # cannot break the no-token path; handled instead of propagating.
        try:
            import requests
        except ImportError:
            print("❌ requests not available for Hugging Face API fallback")
            return "ERROR: Could not generate response"

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }
        # The payload does not depend on the model, so build it once.
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "return_full_text": False,
            },
        }

        # Try free models
        for model in self.FALLBACK_MODELS:
            url = f"https://api-inference.huggingface.co/models/{model}"
            try:
                resp = requests.post(url, headers=headers, json=payload, timeout=30)
                if resp.status_code != 200:
                    continue
                result = resp.json()
            except (requests.RequestException, ValueError) as e:
                # Network errors and undecodable bodies: report, try next model.
                print(f"❌ Fallback model {model} failed: {e}")
                continue

            if isinstance(result, list) and result and isinstance(result[0], dict):
                text = result[0].get("generated_text", "")
                if text:
                    return text

        return "ERROR: Could not generate response"

    def is_loaded(self) -> bool:
        """Return True when the local llama.cpp model is available."""
        return self.llm is not None
# Module-wide LocalLLM singleton; stays None until get_llm() is first called.
_llm_instance = None


def get_llm() -> LocalLLM:
    """Return the shared LocalLLM, constructing it on the first call."""
    global _llm_instance
    if _llm_instance is not None:
        return _llm_instance
    _llm_instance = LocalLLM()
    return _llm_instance