File size: 2,090 Bytes
fa3e5a5
7d3ccee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# M-Wellness Reasoning API v1.0.1
import os
import json
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Flask application object serving the routes defined below.
app = Flask(__name__)

# Model: DeepSeek-R1-Distill-Qwen-7B (The best balance for 16GB RAM Reasoning)
# We use the Q4_K_M quantization to fit in ~5GB RAM, leaving plenty of room for 32k context!
# NOTE(review): this download runs at import time and blocks startup until the
# multi-GB GGUF file is present in the local HF cache. HF_TOKEN is read from
# the environment and may be None — only needed for gated/private repos.
print("Downloading DeepSeek-R1 Reasoning model...")
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf",
    token=os.getenv("HF_TOKEN")
)

# Initialize with 32k context window and 2 CPU threads
# Loading also happens at import time so the first request pays no cold-start cost.
print("Loading model into RAM...")
llm = Llama(
    model_path=model_path,
    n_ctx=32768,    # context window in tokens
    n_threads=2,    # CPU threads used for inference — tune to available cores
    n_batch=512     # prompt-processing batch size
)

@app.route("/")
def home():
    """Health-check endpoint: confirms the service is reachable."""
    status_message = "M-Wellness Reasoning API is online."
    return status_message

@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """OpenAI-compatible chat completion endpoint (streaming and non-streaming).

    Expects a JSON body with "messages" (list of {"role", "content"} dicts)
    and an optional "stream" boolean. The response mimics the OpenAI schema
    so standard clients (Claude Code, Ollama-style tools) can consume it.
    """
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed body; fall back to {} so the .get() calls below are always safe.
    data = request.get_json(silent=True) or {}
    messages = data.get("messages", [])
    stream = data.get("stream", False)

    # Flatten the chat history into a plain-text prompt.
    prompt = ""
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        prompt += f"\n\n{role.capitalize()}: {content}"
    # DeepSeek-R1 wraps its chain-of-thought in <think>...</think> tags;
    # priming the completion with "<think>\n" (the original "<thought>" tag
    # does not match the model's training format) reliably triggers the
    # reasoning mode recommended in the model's usage notes.
    prompt += "\n\nAssistant: <think>\n"

    if stream:
        def generate():
            # Server-Sent Events framing: one "data: {json}" line per token
            # chunk, terminated by the OpenAI-style "[DONE]" sentinel.
            for chunk in llm(prompt, max_tokens=4096, stream=True):
                text = chunk["choices"][0]["text"]
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"
        return Response(stream_with_context(generate()), mimetype="text/event-stream")
    else:
        output = llm(prompt, max_tokens=4096)
        body = json.dumps({
            "choices": [{"message": {"role": "assistant", "content": output["choices"][0]["text"]}}]
        })
        # Explicit JSON content type: returning a bare string would be served
        # by Flask as text/html, confusing strict API clients.
        return Response(body, mimetype="application/json")

if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional HF Spaces port.
    bind_host, bind_port = "0.0.0.0", 7860
    app.run(host=bind_host, port=bind_port)