"""Kwen Foundation inference API.

A FastAPI server that streams completions from a local GGUF model
(via llama-cpp-python) and gates access with SQLite-backed API keys.
Run directly (``python thisfile.py``) to serve on 0.0.0.0:7860.
"""

import logging
import os
import sqlite3
from contextlib import closing

import anyio
import uvicorn
from fastapi import Depends, FastAPI, HTTPException, Security
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from pydantic import BaseModel
from llama_cpp import Llama

# --- INITIALIZATION ---
app = FastAPI(title="Kwen Foundation Official API")

# allow_methods=["*"] is required so the browser's OPTIONS preflight succeeds.
# Origin is pinned to the production front-end; widen only for local debugging.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["https://hydrogenclient.github.io"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger(__name__)
security = HTTPBearer()

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_PATH = os.path.join(BASE_DIR, "kwen-4b-q4_k_m.gguf")
# NOTE(review): DB_PATH is cwd-relative while MODEL_PATH is absolute — the DB
# location therefore depends on where the process is launched. Left as-is for
# backward compatibility; consider os.path.join(BASE_DIR, "keys.db").
DB_PATH = "keys.db"

# --- MODEL LOADING ---
# Load at import time so the first request doesn't pay the model-load cost.
if os.path.exists(MODEL_PATH):
    size_gb = os.path.getsize(MODEL_PATH) / (1024**3)
    logger.info(f"✅ Model found!\nSize: {size_gb:.2f} GB")
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,
        n_threads=2,
        n_gpu_layers=0,      # CPU-only inference
        offload_kqv=False,
        verbose=True,
    )
    logger.info("Kwen Model loaded.")
else:
    logger.error(f"❌ Model NOT found at {MODEL_PATH}")
    llm = None  # endpoints must check for this and answer 503

# --- AUTH SETUP ---
# HACK: a default key checked into source is a secret-leak risk. It can now be
# overridden via the environment; the hardcoded value remains the fallback so
# existing deployments keep working. Rotate it and set KWEN_DEFAULT_KEY.
DEFAULT_KEY = os.environ.get("KWEN_DEFAULT_KEY", "kwen_NjdT7VQauEyDfzVvPLMIQfwUuduJWT6j")


def init_db():
    """Create the keys table (if absent) and seed the default public key."""
    # closing() guarantees the connection is released even if an execute raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS keys (key TEXT PRIMARY KEY, user TEXT)")
        cur = conn.execute("SELECT user FROM keys WHERE key = ?", (DEFAULT_KEY,))
        if not cur.fetchone():
            conn.execute(
                "INSERT INTO keys (key, user) VALUES (?, ?)",
                (DEFAULT_KEY, "Public_Guest"),
            )
        conn.commit()


init_db()


class ChatInput(BaseModel):
    """Request body for /v1/chat."""

    prompt: str
    max_tokens: int = 512
    temperature: float = 0.6


def verify_key(auth: HTTPAuthorizationCredentials = Security(security)) -> str:
    """FastAPI dependency: resolve the Bearer token to a user name.

    Raises HTTPException(403) when the key is not in the database.
    """
    provided_key = auth.credentials.strip()
    with closing(sqlite3.connect(DB_PATH)) as conn:
        user = conn.execute(
            "SELECT user FROM keys WHERE key = ?", (provided_key,)
        ).fetchone()
    if not user:
        # Log only the tail of the key so full credentials never hit the logs.
        logger.warning(f"AUTH FAILED: Access denied for key ending in ...{provided_key[-4:]}")
        raise HTTPException(status_code=403, detail="Invalid API Key")
    return user[0]


# --- STREAMING ENGINE ---
async def kwen_streamer(prompt, max_tokens, temperature):
    """Async generator yielding completion tokens for StreamingResponse.

    llama-cpp's stream generator does its inference work lazily inside each
    __next__ call, so every chunk must be pulled in a worker thread — iterating
    it directly in the async function would block the event loop per token.
    """
    formatted = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
    yield ""  # Initial heartbeat

    def run_inference():
        return llm(
            formatted,
            max_tokens=max_tokens,
            temperature=temperature,
            stop=["<|im_end|>", "user:", "assistant:"],
            stream=True,
        )

    _DONE = object()  # sentinel: distinguishes exhaustion from a falsy chunk
    try:
        stream = await anyio.to_thread.run_sync(run_inference)
        iterator = iter(stream)
        while True:
            # Each next() runs the actual token generation — keep it off the loop.
            chunk = await anyio.to_thread.run_sync(next, iterator, _DONE)
            if chunk is _DONE:
                break
            token = chunk["choices"][0]["text"]
            if token:
                yield token
                await anyio.sleep(0.01)
    except Exception as e:
        # Best-effort: surface the failure in-band so the client sees it.
        logger.error(f"Stream Error: {e}")
        yield f"\n[Stream Error: {e}]"


# --- ENDPOINTS ---
@app.get("/")
def health():
    """Liveness probe."""
    return {"status": "running", "model": "Kwen-4B", "database": "active"}


@app.post("/v1/chat")
async def chat(data: ChatInput, user=Depends(verify_key)):
    """Stream a chat completion; requires a valid Bearer API key."""
    if not llm:
        raise HTTPException(status_code=503, detail="Model Offline")
    logger.info(f"Authorized Request from: {user}")
    return StreamingResponse(
        kwen_streamer(data.prompt, data.max_tokens, data.temperature),
        media_type="text/event-stream",
    )


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)