Update app.py
app.py CHANGED
@@ -1,18 +1,43 @@
 import gradio as gr
+from fastapi import FastAPI, Request
 from llama_cpp import Llama
+import uvicorn
+import threading

-#
+# 1. Load the model (Quantized for 16GB RAM limit)
 llm = Llama.from_pretrained(
     repo_id="tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
     filename="WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
     n_ctx=2048,
-    n_threads=2
+    n_threads=2
 )

-
-
-    prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-    output = llm(prompt, max_tokens=512, stop=["<|im_end|>"], echo=False)
-    return output["choices"][0]["text"]
+# 2. FastAPI Setup (OpenAI Wrapper)
+app = FastAPI()

-
+@app.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    body = await request.json()
+    messages = body.get("messages", [])
+    prompt = f"<|im_start|>user\n{messages[-1]['content']}<|im_end|>\n<|im_start|>assistant\n"
+
+    response = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
+    content = response["choices"][0]["text"]
+
+    return {
+        "choices": [{"message": {"role": "assistant", "content": content}}],
+        "model": "whiterabbitneo"
+    }
+
+# 3. Gradio Interface (Required by HF Spaces)
+def gf_chat(msg, history):
+    return llm(f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n", max_tokens=512)["choices"][0]["text"]
+
+gui = gr.ChatInterface(fn=gf_chat)
+
+# 4. Launch both
+if __name__ == "__main__":
+    # Run FastAPI in a background thread
+    threading.Thread(target=uvicorn.run, kwargs={"app": app, "host": "0.0.0.0", "port": 8000}).start()
+    # Run Gradio on the standard port
+    gui.launch(server_port=7860)
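To sanity-check the new endpoint, a minimal client sketch (not part of the commit: the localhost URL, port, and the requests dependency are assumptions for illustration; Spaces expose only one public port, so 8000 is reachable from inside the Space while the Gradio UI on 7860 is what's served externally):

import requests

# Assumed local address of the uvicorn server started in the background thread above.
url = "http://localhost:8000/v1/chat/completions"

payload = {"messages": [{"role": "user", "content": "Write a Python port scanner."}]}

resp = requests.post(url, json=payload, timeout=120)
resp.raise_for_status()

# The handler returns an OpenAI-style shape:
# {"choices": [{"message": {"role": "assistant", "content": ...}}], "model": "whiterabbitneo"}
print(resp.json()["choices"][0]["message"]["content"])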