Spaces:

fuhaddesmond
/

cortex-ai

Paused

cortex-ai / llm_server.py

Z User

Update Cortex AI: Add transformers, torch deps and improved AI prompts for WAEC/JAMB/NECO

e88cd1b 3 months ago

5.17 kB

	"""
	LLM Server - Self-hosted AI model using llama.cpp
	Runs a real neural network locally - NO API calls, NO hardcoding
	"""

	import os
	import json
	from typing import Optional, Dict, List
	from huggingface_hub import hf_hub_download

	class LocalLLM:
	"""
	Self-hosted LLM using llama.cpp
	Runs entirely locally on CPU - no external API calls
	"""

	def __init__(self):
	self.llm = None
	self.model_path = None
	self._load_model()

	def _load_model(self):
	"""Load the model locally"""
	print("="*60)
	print("🧠 Loading AI Model...")
	print("="*60)

	# Try to load llama-cpp-python
	try:
	from llama_cpp import Llama
	print("✅ llama-cpp-python available")

	# Check for local model file
	local_model = "/app/models/qwen2-0.5b-instruct-q4_k_m.gguf"

	if os.path.exists(local_model):
	print(f"📥 Loading model from: {local_model}")
	self.llm = Llama(
	model_path=local_model,
	n_ctx=2048,
	n_threads=4,
	verbose=False
	)
	self.model_path = local_model
	print("✅ Model loaded successfully!")
	else:
	# Download model on first run
	print("📥 Downloading Qwen2-0.5B model...")
	model_path = hf_hub_download(
	repo_id="Qwen/Qwen2-0.5B-Instruct-GGUF",
	filename="qwen2-0.5b-instruct-q4_k_m.gguf",
	local_dir="/app/models"
	)
	print(f"✅ Downloaded to: {model_path}")
	self.llm = Llama(
	model_path=model_path,
	n_ctx=2048,
	n_threads=4,
	verbose=False
	)
	self.model_path = model_path
	print("✅ Model loaded!")

	except ImportError:
	print("❌ llama-cpp-python not available")
	print("📦 Falling back to Hugging Face API...")
	self.llm = None
	except Exception as e:
	print(f"❌ Model loading failed: {e}")
	self.llm = None

	def generate(self, prompt: str, max_tokens: int = 300, temperature: float = 0.7) -> str:
	"""Generate text using the local model"""
	if self.llm is None:
	return self._fallback_generate(prompt, max_tokens)

	try:
	# Format prompt for Qwen2
	formatted_prompt = f"<\|im_start\|>user\n{prompt}<\|im_end\|>\n<\|im_start\|>assistant\n"

	response = self.llm(
	formatted_prompt,
	max_tokens=max_tokens,
	temperature=temperature,
	top_p=0.9,
	stop=["<\|im_end\|>", "<\|im_start\|>"],
	echo=False
	)

	text = response['choices'][0]['text'].strip()
	print(f"🤖 Generated: {text[:100]}...")
	return text

	except Exception as e:
	print(f"❌ Generation error: {e}")
	return self._fallback_generate(prompt, max_tokens)

	def _fallback_generate(self, prompt: str, max_tokens: int) -> str:
	"""Fallback to Hugging Face Inference API if local model fails"""
	import requests

	token = os.environ.get("HF_TOKEN", "")
	if not token:
	return "ERROR: No AI model available. Set HF_TOKEN or ensure model is downloaded."

	headers = {
	"Authorization": f"Bearer {token}",
	"Content-Type": "application/json"
	}

	# Try free models
	models = [
	"Qwen/Qwen2-0.5B-Instruct",
	"microsoft/Phi-3-mini-4k-instruct",
	"HuggingFaceH4/zephyr-7b-beta"
	]

	for model in models:
	try:
	url = f"https://api-inference.huggingface.co/models/{model}"
	payload = {
	"inputs": prompt,
	"parameters": {
	"max_new_tokens": max_tokens,
	"temperature": 0.7,
	"return_full_text": False
	}
	}

	resp = requests.post(url, headers=headers, json=payload, timeout=30)

	if resp.status_code == 200:
	result = resp.json()
	if isinstance(result, list) and result:
	text = result[0].get("generated_text", "")
	if text:
	return text
	except:
	continue

	return "ERROR: Could not generate response"

	def is_loaded(self) -> bool:
	return self.llm is not None


	# Global instance
	_llm_instance = None

	def get_llm() -> LocalLLM:
	global _llm_instance
	if _llm_instance is None:
	_llm_instance = LocalLLM()
	return _llm_instance