import os
import requests
from pathlib import Path
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
import logging
# ================= CONFIG =================
MODEL_URL = "https://huggingface.co/Neon-AI/Kushina/resolve/main/model.gguf"
MODEL_PATH = "model.gguf"
N_CTX = 16384
N_THREADS = 4
N_BATCH = 256
MAX_TOKENS = 16384
TEMPERATURE = 0.7
TOP_P = 0.9
# ==========================================
# ---------- Logging setup ----------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("KushinaAPI")
app = FastAPI(title="Kushina API", version="1.0")
llm = None # lazy-loaded
# ---------- Download GGUF if not present ----------
if not Path(MODEL_PATH).exists():
    try:
        logger.info("Downloading model.gguf from Hugging Face...")
        r = requests.get(MODEL_URL, stream=True)
        r.raise_for_status()
        with open(MODEL_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        logger.info("Download complete ✅")
    except Exception as e:
        logger.exception("Failed to download model.gguf")
        raise RuntimeError(f"Failed to download model.gguf: {e}")

# ---------- Lazy load llama.cpp ----------
def get_llm():
    global llm
    if llm is None:
        try:
            logger.info("Loading GGUF model into llama.cpp…")
            llm = Llama(
                model_path=MODEL_PATH,
                n_ctx=N_CTX,
                n_threads=N_THREADS,
                n_batch=N_BATCH,
                f16_kv=True,
                use_mmap=True,
                verbose=False,
            )
            logger.info("Model loaded ✅")
        except Exception as e:
            logger.exception("Failed to load GGUF model")
            raise RuntimeError(f"Failed to load GGUF model: {e}")
    return llm

# ---------- Request schema ----------
class PromptRequest(BaseModel):
    prompt: str

# ---------- System prompt ----------
SYSTEM_PROMPT = """You are Kushina.
Modes: CHAT or CODE
Rules:
- CHAT: mirror user tone, short responses, no explanations unless asked.
- CODE: output only code when user asks, no commentary.
Switch to CODE if user asks for code, script, function, program, website, api, algorithm, app.
Otherwise use CHAT.
"""
def build_prompt(user_text: str) -> str:
    return f"<|system|>\n{SYSTEM_PROMPT}\n<|user|>\n{user_text}\n<|assistant|>\n"

# ---------- API endpoints ----------
@app.get("/")
def root():
return {"status": "ok"}
@app.post("/generate")
def generate(req: PromptRequest):
    try:
        llm_instance = get_llm()  # lazy-load the model on first request
        full_prompt = build_prompt(req.prompt)
        output_text = ""
        for chunk in llm_instance(
            full_prompt,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stream=True,
            stop=["<|user|>", "<|system|>"],
        ):
            if "choices" in chunk:
                output_text += chunk["choices"][0]["text"]
        return {"response": output_text}
    except Exception as e:
        # Return the error as JSON instead of raising a 500
        logger.exception("Error during generation")
        return {"error": str(e)}