import gradio as gr
from llama_cpp import Llama
import json
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse, RedirectResponse
import uvicorn

# 1. Load Model
model_path = "model.gguf"
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,          # Small context window (saves memory/time on CPU)
    n_threads=8,         # Use all available vCPUs for generation
    n_threads_batch=8,   # Parallelize prompt processing as well
    n_batch=256,         # Batch size for prompt evaluation
    verbose=False,
)

# Generation budget: prompt + completion must fit inside n_ctx (1024),
# so cap completions at 512 tokens to leave room for the prompt.
MAX_TOKENS = 512

# 2. FastAPI Setup
app = FastAPI()


@app.get("/")
def read_root():
    # Send visitors to the Gradio UI mounted at /ui
    return RedirectResponse(url="/ui")


@app.get("/health")
def health():
    return {"status": "ok"}


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Minimal OpenAI-style chat endpoint (non-streaming and SSE streaming)."""
    data = await request.json()
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    # Simple prompt builder: "User: ...\nAssistant: ...\n" turns
    prompt = ""
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        prompt += f"{role.capitalize()}: {content}\n"
    prompt += "Assistant:"

    if not stream:
        output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=MAX_TOKENS)
        text = output["choices"][0]["text"]
        return JSONResponse(
            {"choices": [{"message": {"role": "assistant", "content": text.strip()}}]}
        )

    def generate():
        # Stream tokens as OpenAI-style server-sent events
        output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=MAX_TOKENS, stream=True)
        for chunk in output:
            text = chunk["choices"][0]["text"]
            yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")


# 3. Gradio UI Setup
def predict(message, history):
    # history is a list of (user, assistant) pairs (Gradio's tuple format)
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = llm(prompt, max_tokens=MAX_TOKENS, stop=["User:"], echo=False, stream=True)
    response = ""
    for chunk in output:
        response += chunk["choices"][0]["text"]
        yield response  # Yield the growing response so the UI updates live


demo = gr.ChatInterface(
    fn=predict,
    title="VisamIntelli-Flash",
    description="Your private AI brain on Hugging Face.",
)

# 4. Mount Gradio to FastAPI at /ui
app = gr.mount_gradio_app(app, demo, path="/ui")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
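# ---------------------------------------------------------------------------
# Usage sketch (assumption: the server is reachable at http://localhost:7860,
# the port configured above). The /v1/chat/completions endpoint accepts an
# OpenAI-style body; a minimal non-streaming client call with the `requests`
# library (a hypothetical client, not part of this app) would look like:
#
#   import requests
#   r = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={"messages": [{"role": "user", "content": "Hello"}]},
#   )
#   print(r.json()["choices"][0]["message"]["content"])
#
# Set "stream": true in the body to receive server-sent events instead.
# ---------------------------------------------------------------------------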