# M-Wellness Reasoning API v1.0.1
"""Flask service exposing an OpenAI-compatible chat endpoint backed by a local
DeepSeek-R1 GGUF model running under llama.cpp (llama-cpp-python)."""

import json
import os

from flask import Flask, Response, request, stream_with_context
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

app = Flask(__name__)

# Model: DeepSeek-R1-Distill-Qwen-7B (The best balance for 16GB RAM Reasoning)
# We use the Q4_K_M quantization to fit in ~5GB RAM, leaving plenty of room
# for a 32k context window.
print("Downloading DeepSeek-R1 Reasoning model...")
model_path = hf_hub_download(
    repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
    filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf",
    token=os.getenv("HF_TOKEN"),  # optional; None means anonymous download
)

# Initialize with a 32k context window and 2 CPU threads.
print("Loading model into RAM...")
llm = Llama(
    model_path=model_path,
    n_ctx=32768,
    n_threads=2,
    n_batch=512,
)


@app.route("/")
def home():
    """Liveness check — plain-text confirmation that the service is up."""
    return "M-Wellness Reasoning API is online."


def _build_prompt(messages):
    """Flatten OpenAI-style chat messages into the plain-text prompt format
    used for DeepSeek-R1, ending with an open "Assistant:" turn.

    Each message is a dict with optional "role" (default "user") and
    "content" (default "") keys.
    """
    # str.join instead of repeated += — linear, not quadratic, in total size.
    turns = "".join(
        f"\n\n{msg.get('role', 'user').capitalize()}: {msg.get('content', '')}"
        for msg in messages
    )
    return turns + "\n\nAssistant: \n"


@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """OpenAI/Claude Code/Ollama-compatible chat completion endpoint.

    Accepts a JSON body with "messages" (list of {role, content} dicts) and
    an optional "stream" flag. Returns either a single JSON completion or a
    Server-Sent-Events stream of delta chunks terminated by "data: [DONE]".
    """
    # BUG FIX: request.json raises (or yields None) on a missing/malformed
    # JSON body, which crashed this handler with a 500. get_json(silent=True)
    # lets us reject bad input with an explicit 400 instead.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return Response(
            json.dumps({"error": "Request body must be a JSON object."}),
            status=400,
            mimetype="application/json",
        )

    messages = data.get("messages", [])
    stream = bool(data.get("stream", False))
    prompt = _build_prompt(messages)

    if stream:
        def generate():
            # Forward llama.cpp token chunks as SSE events in the OpenAI
            # streaming "delta" shape, then signal completion with [DONE].
            for chunk in llm(prompt, max_tokens=4096, stream=True):
                text = chunk["choices"][0]["text"]
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"

        return Response(stream_with_context(generate()), mimetype="text/event-stream")

    output = llm(prompt, max_tokens=4096)
    # BUG FIX: returning a bare json.dumps string made Flask serve it as
    # text/html; OpenAI-style clients require Content-Type: application/json.
    body = {
        "choices": [
            {
                "message": {
                    "role": "assistant",
                    "content": output["choices"][0]["text"],
                }
            }
        ]
    }
    return Response(json.dumps(body), mimetype="application/json")


if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces port.
    app.run(host="0.0.0.0", port=7860)