# Provenance (Hugging Face upload metadata):
# mikkhan's picture
# Upload server.py with huggingface_hub
# fa3e5a5 verified
# M-Wellness Reasoning API v1.0.1
import os
import json
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Flask application serving an OpenAI-compatible chat-completions API.
app = Flask(__name__)
# Model: DeepSeek-R1-Distill-Qwen-7B (The best balance for 16GB RAM Reasoning)
# We use the Q4_K_M quantization to fit in ~5GB RAM, leaving plenty of room for 32k context!
# NOTE: this runs at import time — the process blocks here until the GGUF
# file is downloaded (or found in the local hub cache).
print("Downloading DeepSeek-R1 Reasoning model...")
model_path = hf_hub_download(
repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf",
token=os.getenv("HF_TOKEN")  # optional auth token; None when the env var is unset
)
# Initialize with 32k context window and 2 CPU threads
print("Loading model into RAM...")
llm = Llama(
model_path=model_path,
n_ctx=32768,  # context window size in tokens
n_threads=2,  # CPU threads used for inference
n_batch=512  # prompt-processing batch size
)
@app.route("/")
def home():
    """Health-check endpoint: confirms the service is up and reachable."""
    status_message = "M-Wellness Reasoning API is online."
    return status_message
@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """OpenAI-compatible chat completions endpoint (streaming and non-streaming).

    Expects a JSON body with "messages" (a list of {"role", "content"} dicts)
    and an optional "stream" boolean. Returns an SSE stream of delta chunks
    when streaming, otherwise a single JSON completion object.
    """
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body, so a bad request degrades to an empty prompt
    # rather than an unhandled 500 (request.json would raise / yield None).
    data = request.get_json(silent=True) or {}
    messages = data.get("messages", [])
    stream = bool(data.get("stream", False))
    # Simple prompt formatting for DeepSeek-R1.
    prompt = ""
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        prompt += f"\n\n{role.capitalize()}: {content}"
    # DeepSeek-R1 models emit their chain of thought inside <think> tags;
    # the previous "<thought>" tag did not match the model's training format.
    prompt += "\n\nAssistant: <think>\n"
    if stream:
        def generate():
            # Forward llama.cpp token chunks as OpenAI-style SSE deltas.
            for chunk in llm(prompt, max_tokens=4096, stream=True):
                text = chunk["choices"][0]["text"]
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"
        return Response(stream_with_context(generate()), mimetype="text/event-stream")
    # Non-streaming: return one completed assistant message. Use an explicit
    # JSON mimetype — returning a bare json.dumps string would be served by
    # Flask as text/html, confusing strict OpenAI-compatible clients.
    output = llm(prompt, max_tokens=4096)
    body = json.dumps({
        "choices": [{"message": {"role": "assistant", "content": output["choices"][0]["text"]}}]
    })
    return Response(body, mimetype="application/json")
if __name__ == "__main__":
    # Bind on all interfaces. The port is configurable via the PORT env var
    # and defaults to 7860 (the standard Hugging Face Spaces port), so the
    # original behavior is preserved when PORT is unset.
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))