import gradio as gr
from llama_cpp import Llama
import os
import json
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse, RedirectResponse
import uvicorn

# 1. Load Model
model_path = "model.gguf"
print(f"Loading model from {model_path}...")
llm = Llama(
    model_path=model_path,
    n_ctx=1024,          # Drastically reduced context size (saves memory/time on CPU)
    n_threads=8,         # Use all available vCPUs for generation
    n_threads_batch=8,   # Speed up prompt processing
    n_batch=256,         # Batch size for prompt evaluation
    verbose=False
)
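
# Optional sanity check (an assumption, not part of the original flow): uncomment to
# confirm the model loads and generates before the API starts serving requests.
# print(llm("Hello", max_tokens=8)["choices"][0]["text"])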
# 2. FastAPI Setup
app = FastAPI()

@app.get("/")
def read_root():
    # Send the bare root URL to the Gradio UI mounted at /ui
    return RedirectResponse(url="/ui")

@app.get("/health")
def health():
    return {"status": "ok"}

# Route path follows the OpenAI-style chat completions convention
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    data = await request.json()
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    # Simple prompt builder: flatten the chat history into "Role: content" lines
    prompt = ""
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        prompt += f"{role.capitalize()}: {content}\n"
    prompt += "Assistant:"

    if not stream:
        output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024)
        text = output['choices'][0]['text']
        return JSONResponse({
            "choices": [{"message": {"content": text.strip()}}]
        })
    else:
        def generate():
            # Stream tokens back as Server-Sent Events in OpenAI delta format
            output = llm(prompt, stop=["User:", "Assistant:"], max_tokens=1024, stream=True)
            for chunk in output:
                text = chunk['choices'][0]['text']
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"
        return StreamingResponse(generate(), media_type="text/event-stream")
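
# Example request (a sketch; assumes the server is reachable at <your-space-url> and the
# endpoint is mounted at /v1/chat/completions as above). Streaming works the same way with
# "stream": true and an SSE-capable client:
#
#   curl -X POST <your-space-url>/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "stream": false}'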
# 3. Gradio UI Setup
def predict(message, history):
    # Rebuild the conversation as a plain-text prompt from Gradio's (user, assistant) pairs
    prompt = ""
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = llm(prompt, max_tokens=1024, stop=["User:"], echo=False, stream=True)
    response = ""
    for chunk in output:
        delta = chunk['choices'][0]['text']
        response += delta
        yield response

demo = gr.ChatInterface(
    fn=predict,
    title="VisamIntelli-Flash",
    description="Your private AI brain on Hugging Face.",
)

# 4. Mount Gradio to FastAPI at /ui
app = gr.mount_gradio_app(app, demo, path="/ui")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
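
# To run locally (a sketch; assumes gradio, llama-cpp-python, fastapi, and uvicorn are
# installed and model.gguf sits next to this file):
#   python app.py   # adjust if this file has a different name
# Then open http://localhost:7860/ui for the chat UI, or GET http://localhost:7860/health
# to verify the server is up.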