Spaces:
Paused
Paused
Z User
Update Cortex AI: Add transformers, torch deps and improved AI prompts for WAEC/JAMB/NECO
e88cd1b | """ | |
| LLM Server - Self-hosted AI model using llama.cpp | |
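| Runs a real neural network locally, so responses are model-generated, not hardcoded; the Hugging Face Inference API is used only as a fallback when the local model is unavailable or generation fails | |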
| """ | |
| import os | |
| import json | |
| from typing import Optional, Dict, List | |
| from huggingface_hub import hf_hub_download | |
class LocalLLM:
    """Self-hosted LLM backed by llama.cpp (through ``llama-cpp-python``).

    On construction the Qwen2-0.5B-Instruct GGUF model is loaded from
    ``MODEL_DIR``; if the file is not there it is first downloaded from the
    Hugging Face Hub. When loading fails for any reason ``self.llm`` stays
    ``None`` and :meth:`generate` uses the Hugging Face Inference API instead
    (which requires the ``HF_TOKEN`` environment variable).
    """

    # Where the GGUF model comes from and where it is stored on disk.
    MODEL_REPO = "Qwen/Qwen2-0.5B-Instruct-GGUF"
    MODEL_FILE = "qwen2-0.5b-instruct-q4_k_m.gguf"
    MODEL_DIR = "/app/models"

    # Settings passed to the llama.cpp model constructor.
    N_CTX = 2048
    N_THREADS = 4

    # Hosted models tried, in order, when the local model cannot be used.
    FALLBACK_MODELS = (
        "Qwen/Qwen2-0.5B-Instruct",
        "microsoft/Phi-3-mini-4k-instruct",
        "HuggingFaceH4/zephyr-7b-beta",
    )

    def __init__(self):
        """Create the wrapper and immediately try to load the local model."""
        self.llm = None
        self.model_path = None
        self._load_model()

    def _load_model(self):
        """Load the GGUF model, downloading it first if it is not on disk.

        Never raises: on any failure a message is printed and ``self.llm`` is
        left as ``None`` so that callers fall back to the hosted API.
        """
        print("=" * 60)
        print("π§ Loading AI Model...")
        print("=" * 60)

        # Only a failure of this import means the package is missing; keeping
        # it in its own ``try`` stops an ImportError raised later (during
        # download or load) from being misreported as "not available".
        try:
            from llama_cpp import Llama
        except ImportError:
            print("β llama-cpp-python not available")
            print("π¦ Falling back to Hugging Face API...")
            self.llm = None
            return
        print("β llama-cpp-python available")

        local_model = f"{self.MODEL_DIR}/{self.MODEL_FILE}"
        try:
            if os.path.exists(local_model):
                print(f"π₯ Loading model from: {local_model}")
                model_path = local_model
                loaded_message = "β Model loaded successfully!"
            else:
                # First run: fetch the model file into MODEL_DIR.
                print("π₯ Downloading Qwen2-0.5B model...")
                model_path = hf_hub_download(
                    repo_id=self.MODEL_REPO,
                    filename=self.MODEL_FILE,
                    local_dir=self.MODEL_DIR,
                )
                print(f"β Downloaded to: {model_path}")
                loaded_message = "β Model loaded!"

            # Single construction site shared by both branches above.
            self.llm = Llama(
                model_path=model_path,
                n_ctx=self.N_CTX,
                n_threads=self.N_THREADS,
                verbose=False,
            )
            self.model_path = model_path
            print(loaded_message)
        except Exception as e:
            # Deliberate best-effort boundary: a broken download or model file
            # must not crash start-up, the hosted fallback is used instead.
            print(f"β Model loading failed: {e}")
            self.llm = None

    def generate(
        self,
        prompt: str,
        max_tokens: int = 300,
        temperature: float = 0.7,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: User message; it is wrapped in the Qwen2 chat template
                before being passed to the local model.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature, applied to the local model and
                forwarded to the hosted fallback.

        Returns:
            The stripped generated text. If the local model is not loaded or
            raises, the result of :meth:`_fallback_generate` is returned,
            which may be a string starting with ``"ERROR:"``.
        """
        if self.llm is None:
            return self._fallback_generate(prompt, max_tokens, temperature)

        # Format prompt for Qwen2
        formatted_prompt = (
            f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        )
        try:
            response = self.llm(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                stop=["<|im_end|>", "<|im_start|>"],
                echo=False,
            )
            text = response["choices"][0]["text"].strip()
        except Exception as e:
            # Any local failure (including a malformed response) degrades to
            # the hosted API rather than propagating to the caller.
            print(f"β Generation error: {e}")
            return self._fallback_generate(prompt, max_tokens, temperature)

        print(f"π€ Generated: {text[:100]}...")
        return text

    def _fallback_generate(
        self,
        prompt: str,
        max_tokens: int,
        temperature: float = 0.7,
    ) -> str:
        """Generate text through the Hugging Face Inference API.

        Each model in ``FALLBACK_MODELS`` is tried in order and the first
        non-empty ``generated_text`` is returned.

        Args:
            prompt: Text sent as the request's ``inputs`` (no chat template).
            max_tokens: Sent as ``max_new_tokens``.
            temperature: Sampling temperature; defaults to 0.7 so existing
                two-argument calls behave as before.

        Returns:
            The generated text, or a string starting with ``"ERROR:"`` when
            ``HF_TOKEN`` is unset, ``requests`` is not installed, or no model
            produced output.
        """
        token = os.environ.get("HF_TOKEN", "")
        if not token:
            return "ERROR: No AI model available. Set HF_TOKEN or ensure model is downloaded."

        # Imported only once a token is known to exist, so the "no token"
        # answer above does not depend on ``requests`` being installed.
        try:
            import requests
        except ImportError:
            print("requests is not installed; cannot call Hugging Face API")
            return "ERROR: Could not generate response"

        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json",
        }
        # The payload is identical for every model, so build it once.
        payload = {
            "inputs": prompt,
            "parameters": {
                "max_new_tokens": max_tokens,
                "temperature": temperature,
                "return_full_text": False,
            },
        }

        # Try free models
        for model in self.FALLBACK_MODELS:
            url = f"https://api-inference.huggingface.co/models/{model}"
            try:
                resp = requests.post(url, headers=headers, json=payload, timeout=30)
                if resp.status_code != 200:
                    continue
                result = resp.json()
            except (requests.RequestException, ValueError) as e:
                # Network problem or non-JSON body: report it and move on to
                # the next model instead of silently swallowing everything.
                print(f"Fallback model {model} failed: {e}")
                continue

            if isinstance(result, list) and result and isinstance(result[0], dict):
                text = result[0].get("generated_text", "")
                if text:
                    return text

        return "ERROR: Could not generate response"

    def is_loaded(self) -> bool:
        """Return ``True`` when a local model object is available."""
        return self.llm is not None
# Process-wide LocalLLM instance, created lazily by get_llm().
_llm_instance = None


def get_llm() -> LocalLLM:
    """Return the shared LocalLLM, constructing it on the first call."""
    global _llm_instance
    if _llm_instance is not None:
        return _llm_instance
    _llm_instance = LocalLLM()
    return _llm_instance