"""FastAPI + Gradio front end around a locally built llama.cpp CLI.

On import the module downloads the GGUF model and builds llama.cpp if they
are missing, then exposes:

* ``POST /api/chat`` -- JSON endpoint protected by the ``X-API-Key`` header.
* ``/``              -- a Gradio text-in/text-out UI.
"""
import hmac
import os
import subprocess
import time
from typing import Optional

import gradio as gr
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel

MODEL_FILE = "Dolphin-X1-8B.Q4_K_M.gguf"
MODEL_URL = "https://huggingface.co/dphn/Dolphin-X1-8B-GGUF/resolve/main/Dolphin-X1-8B.Q4_K_M.gguf"
SPACE_API_KEY = os.getenv("SPACE_API_KEY")
BUILD_DIR = "llama"
THREADS = "4"

# Names the llama.cpp CLI binary may have inside ``BUILD_DIR/bin``.
# Current llama.cpp checkouts build ``llama-cli``; older ones built ``main``.
# The newer name is tried first.
_BINARY_NAMES = ("llama-cli", "main")


def _find_binary() -> Optional[str]:
    """Return the path of the built llama.cpp CLI, or ``None`` if not built."""
    for name in _BINARY_NAMES:
        candidate = f"./{BUILD_DIR}/bin/{name}"
        if os.path.isfile(candidate):
            return candidate
    return None


def setup():
    """Fetch the model and build llama.cpp when either is missing.

    Raises:
        subprocess.CalledProcessError: if wget, git or cmake exits non-zero.
        FileNotFoundError: if one of those tools is not installed.
    """
    if not os.path.exists(MODEL_FILE):
        # Download to a side file and rename on success, so an interrupted
        # download is never mistaken for a complete model on the next start.
        partial = MODEL_FILE + ".part"
        try:
            subprocess.run(["wget", "-q", MODEL_URL, "-O", partial], check=True)
            os.replace(partial, MODEL_FILE)
        finally:
            if os.path.exists(partial):
                os.remove(partial)

    if not os.path.exists("llama.cpp"):
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp"],
            check=True,
        )

    # Key the build on the binary rather than on BUILD_DIR: a failed build
    # leaves the directory behind, and cmake re-runs are incremental anyway.
    if _find_binary() is None:
        subprocess.run(["cmake", "-S", "llama.cpp", "-B", BUILD_DIR], check=True)
        subprocess.run(
            ["cmake", "--build", BUILD_DIR, "--config", "Release", "-j", THREADS],
            check=True,
        )


setup()

app = FastAPI()


class Query(BaseModel):
    """Request body for ``/api/chat``; ``q`` is the prompt text."""

    q: str


def _generate(prompt: str) -> str:
    """Run the llama.cpp CLI on ``prompt`` and return its decoded stdout.

    The process is waited for and its pipe closed (``subprocess.run``), so no
    zombie processes or open descriptors accumulate across requests. A
    non-zero exit status is tolerated; whatever was printed is returned.
    """
    binary = _find_binary() or f"./{BUILD_DIR}/bin/main"
    completed = subprocess.run(
        [binary, "-m", MODEL_FILE, "-p", prompt, "-n", "200", "-t", THREADS],
        stdin=subprocess.DEVNULL,  # the child must never block on the server's stdin
        stdout=subprocess.PIPE,
        check=False,
    )
    return completed.stdout.decode(errors="ignore")


def _is_authorized(provided: Optional[str]) -> bool:
    """Return True only when a key is configured and ``provided`` matches it."""
    # Fail closed: with SPACE_API_KEY unset, a request without the header
    # would otherwise compare None to None and be let through.
    if not SPACE_API_KEY or provided is None:
        return False
    # Constant-time comparison; bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    return hmac.compare_digest(provided.encode(), SPACE_API_KEY.encode())


@app.post("/api/chat")
def chat(q: Query, x_api_key: Optional[str] = Header(None)):
    """Answer a prompt over HTTP.

    Args:
        q: Request body carrying the prompt in ``q.q``.
        x_api_key: Value of the ``X-API-Key`` request header.

    Returns:
        ``{"reply": <model output>}``.

    Raises:
        HTTPException: 401 when the key is missing, wrong, or not configured.
    """
    if not _is_authorized(x_api_key):
        raise HTTPException(401, "Unauthorized")
    return {"reply": _generate(q.q)}


def ui_chat(q):
    """Gradio callback: return the model output for the prompt ``q``."""
    return _generate(q)


# mount_gradio_app attaches the UI to ``app`` and returns that same app.
app = gr.mount_gradio_app(app, gr.Interface(ui_chat, gr.Textbox(), gr.Textbox()), path="/")