| import gradio as gr |
| from fastapi import FastAPI, Request |
| from llama_cpp import Llama |
| import uvicorn |
| import threading |
|
|
| |
# Arguments forwarded verbatim to Llama.from_pretrained. repo_id/filename pick
# the GGUF file to load; n_ctx and n_threads are passed straight through to
# llama_cpp (presumably context size and worker-thread count -- see its docs).
_MODEL_SETTINGS = {
    "repo_id": "tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
    "filename": "WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
    "n_ctx": 2048,
    "n_threads": 2,
}

# Single module-level model instance shared by the HTTP endpoint and the chat UI.
llm = Llama.from_pretrained(**_MODEL_SETTINGS)


# FastAPI application that hosts the HTTP completion endpoint.
app = FastAPI()
|
|
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Answer a chat-completion style request using the module-level model.

    Only the ``content`` of the last entry in ``messages`` is sent to the
    model; earlier entries and the ``role`` fields are not used. The text is
    wrapped as a single user turn in ``<|im_start|>``/``<|im_end|>`` markers.

    Args:
        request: Incoming request whose JSON body is an object carrying a
            non-empty ``messages`` list.

    Returns:
        A dict holding one assistant message under ``choices`` plus the fixed
        ``model`` label ``"whiterabbitneo"``.

    Raises:
        HTTPException: With status 400 when the body is not a JSON object,
            when ``messages`` is missing, empty or not a list, or when its
            last entry has no ``content`` field.
    """
    # Local import: the module header only imports FastAPI and Request.
    from fastapi import HTTPException

    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    messages = body.get("messages", [])
    if not isinstance(messages, list) or not messages:
        raise HTTPException(
            status_code=400, detail="'messages' must be a non-empty list."
        )

    last_message = messages[-1]
    if not isinstance(last_message, dict) or "content" not in last_message:
        raise HTTPException(
            status_code=400,
            detail="The last message must be an object with a 'content' field.",
        )

    prompt = (
        f"<|im_start|>user\n{last_message['content']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # NOTE(review): llm(...) is called synchronously (not awaited), so this
    # coroutine does not yield to the event loop while the model generates.
    response = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    content = response["choices"][0]["text"]

    return {
        "choices": [{"message": {"role": "assistant", "content": content}}],
        "model": "whiterabbitneo",
    }
|
|
| |
def gf_chat(msg, history):
    """Generate the model's reply to ``msg`` for the chat UI.

    Args:
        msg: Text of the newest user message.
        history: Conversation history handed in by the caller (presumably
            ``gr.ChatInterface``); it is not used, so every reply is generated
            from ``msg`` alone.

    Returns:
        The ``text`` of the first completion choice returned by the model.
    """
    prompt = f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n"
    # Pass the same stop sequence as the /v1/chat/completions handler in this
    # file, so both entry points cut the reply at the end-of-turn marker.
    completion = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    return completion["choices"][0]["text"]
|
|
# Gradio chat front-end backed by gf_chat.
gui = gr.ChatInterface(fn=gf_chat)


if __name__ == "__main__":
    # Serve the FastAPI app on port 8000 from a background thread. The thread
    # is a daemon so that it cannot keep the interpreter alive on its own once
    # the main thread (running the Gradio UI) has finished or failed.
    api_thread = threading.Thread(
        target=uvicorn.run,
        kwargs={"app": app, "host": "0.0.0.0", "port": 8000},
        name="uvicorn-api",
        daemon=True,
    )
    api_thread.start()

    # Run the Gradio UI on port 7860 from the main thread.
    gui.launch(server_port=7860)
|
|