4n0s committed on
Commit
10d0db2
·
verified ·
1 Parent(s): c187b82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -8
app.py CHANGED
@@ -1,18 +1,43 @@
1
  import gradio as gr
 
2
  from llama_cpp import Llama
 
 
3
 
4
- # Downloads the 4-bit GGUF model from Hugging Face
5
  llm = Llama.from_pretrained(
6
  repo_id="tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
7
  filename="WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
8
  n_ctx=2048,
9
- n_threads=2 # CPU Basic has 2 vCPUs
10
  )
11
 
12
- def generate(message, history):
13
- # Basic ChatML formatting for Qwen
14
- prompt = f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
15
- output = llm(prompt, max_tokens=512, stop=["<|im_end|>"], echo=False)
16
- return output["choices"][0]["text"]
17
 
18
- gr.ChatInterface(fn=generate).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from fastapi import FastAPI, Request
3
  from llama_cpp import Llama
4
+ import uvicorn
5
+ import threading
6
 
7
# 1. Load the model: downloads the 4-bit (Q4_K_M) GGUF quantization of
# WhiteRabbitNeo 7B from Hugging Face and loads it via llama.cpp.
# The Q4_K_M quant is chosen to fit the Space's RAM budget (original
# comment says a 16 GB limit — TODO confirm hardware tier).
llm = Llama.from_pretrained(
    repo_id="tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
    filename="WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
    n_ctx=2048,   # context window cap (tokens)
    n_threads=2   # presumably matches the 2 vCPUs of CPU Basic — verify
)
14
 
15
# 2. FastAPI app that exposes an OpenAI-compatible wrapper around `llm`
# (served on its own port from the __main__ block below).
app = FastAPI()
 
 
 
17
 
18
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Minimal OpenAI-compatible, non-streaming chat completions endpoint.

    Expects a JSON body containing a "messages" list of
    {"role": ..., "content": ...} dicts (OpenAI chat format), renders the
    conversation into Qwen ChatML, and returns a single choice.

    Returns:
        An OpenAI-style response dict with one assistant message, or an
        OpenAI-style error dict when "messages" is missing or empty.
    """
    body = await request.json()
    messages = body.get("messages", [])

    # Guard: the previous code indexed messages[-1], which raised
    # IndexError (an HTTP 500) on an empty or missing "messages" list.
    if not messages:
        return {
            "error": {
                "message": "'messages' must be a non-empty list",
                "type": "invalid_request_error",
            }
        }

    # Render EVERY turn (system/user/assistant) into ChatML rather than
    # only the last message, so multi-turn context and system prompts
    # actually reach the model; then open the assistant turn.
    prompt = "".join(
        f"<|im_start|>{m.get('role', 'user')}\n{m.get('content', '')}<|im_end|>\n"
        for m in messages
    ) + "<|im_start|>assistant\n"

    # stop on <|im_end|> so generation ends with the assistant turn.
    response = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    content = response["choices"][0]["text"]

    return {
        "choices": [{"message": {"role": "assistant", "content": content}}],
        "model": "whiterabbitneo",
    }
31
+
32
# 3. Gradio Interface (Required by HF Spaces)
def gf_chat(msg, history):
    """Gradio chat handler: wrap `msg` in Qwen ChatML and generate a reply.

    Args:
        msg: The user's latest message text.
        history: Prior turns supplied by gr.ChatInterface (currently unused).

    Returns:
        The model's completion text for this single-turn prompt.
    """
    prompt = f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n"
    # Bug fix: pass stop=["<|im_end|>"] (as /v1/chat/completions already
    # does) so generation ends at the assistant turn instead of running
    # on to max_tokens with spurious extra turns.
    output = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    return output["choices"][0]["text"]
35
+
36
+ gui = gr.ChatInterface(fn=gf_chat)
37
+
38
# 4. Launch both servers: FastAPI (OpenAI-compatible API) on :8000 in a
# background thread, Gradio (the Space UI) on :7860 in the foreground.
if __name__ == "__main__":
    # daemon=True so the uvicorn thread cannot keep the process alive
    # after the foreground Gradio server shuts down (a non-daemon thread
    # would make the process hang on exit).
    threading.Thread(
        target=uvicorn.run,
        kwargs={"app": app, "host": "0.0.0.0", "port": 8000},
        daemon=True,
    ).start()
    # 7860 is the port HF Spaces routes to by default.
    gui.launch(server_port=7860)