# Provenance (Hugging Face upload metadata):
# mikkhan's picture
# Upload server.py with huggingface_hub
# fa3e5a5 verified
# M-Wellness Reasoning API v1.0.1
import os
import json
from flask import Flask, request, Response, stream_with_context
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Flask application serving an OpenAI-compatible chat-completions API.
app = Flask(__name__)
# Model: DeepSeek-R1-Distill-Qwen-7B (The best balance for 16GB RAM Reasoning)
# We use the Q4_K_M quantization to fit in ~5GB RAM, leaving plenty of room for 32k context!
# NOTE: this runs at import time — the process blocks here until the GGUF
# file is downloaded (or found in the local hub cache).
print("Downloading DeepSeek-R1 Reasoning model...")
model_path = hf_hub_download(
repo_id="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF",
filename="DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf",
token=os.getenv("HF_TOKEN")  # optional auth token; None when the env var is unset
)
# Initialize with 32k context window and 2 CPU threads
print("Loading model into RAM...")
llm = Llama(
model_path=model_path,
n_ctx=32768,  # context window size in tokens
n_threads=2,  # CPU threads used for inference
n_batch=512  # prompt-processing batch size
)
@app.route("/")
def home():
    """Health-check endpoint: confirms the service is up and reachable."""
    status_message = "M-Wellness Reasoning API is online."
    return status_message
@app.route("/v1/chat/completions", methods=["POST"])
def chat_completions():
    """OpenAI-compatible chat completions endpoint (streaming and non-streaming).

    Expects a JSON body with "messages" (a list of {"role", "content"} dicts)
    and an optional "stream" boolean. Returns an SSE stream of delta chunks
    when streaming, otherwise a single JSON completion object.
    """
    # get_json(silent=True) returns None instead of raising on a missing or
    # malformed JSON body, so a bad request degrades to an empty prompt
    # rather than an unhandled 500 (request.json would raise / yield None).
    data = request.get_json(silent=True) or {}
    messages = data.get("messages", [])
    stream = bool(data.get("stream", False))
    # Simple prompt formatting for DeepSeek-R1.
    prompt = ""
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        prompt += f"\n\n{role.capitalize()}: {content}"
    # DeepSeek-R1 models emit their chain of thought inside <think> tags;
    # the previous "<thought>" tag did not match the model's training format.
    prompt += "\n\nAssistant: <think>\n"
    if stream:
        def generate():
            # Forward llama.cpp token chunks as OpenAI-style SSE deltas.
            for chunk in llm(prompt, max_tokens=4096, stream=True):
                text = chunk["choices"][0]["text"]
                yield f"data: {json.dumps({'choices': [{'delta': {'content': text}}]})}\n\n"
            yield "data: [DONE]\n\n"
        return Response(stream_with_context(generate()), mimetype="text/event-stream")
    # Non-streaming: return one completed assistant message. Use an explicit
    # JSON mimetype — returning a bare json.dumps string would be served by
    # Flask as text/html, confusing strict OpenAI-compatible clients.
    output = llm(prompt, max_tokens=4096)
    body = json.dumps({
        "choices": [{"message": {"role": "assistant", "content": output["choices"][0]["text"]}}]
    })
    return Response(body, mimetype="application/json")
if __name__ == "__main__":
    # Bind on all interfaces. The port is configurable via the PORT env var
    # and defaults to 7860 (the standard Hugging Face Spaces port), so the
    # original behavior is preserved when PORT is unset.
    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "7860")))