File size: 3,914 Bytes

# Minimal OpenAI-compatible server for MiroThinker using plain transformers.
# No vLLM/SGLang needed. Works wherever PyTorch CUDA works (incl. Windows).
#
#   pip install transformers accelerate fastapi uvicorn torch
#   python mirothinker_server.py
#
# Then set in miroflow-agent .env / config:
#   base_url: http://localhost:61005/v1   (provider "qwen", any api_key)
#
# Handles the two non-standard params miroflow-agent sends via extra_body:
#   - repetition_penalty
#   - continue_final_message / add_generation_prompt (resume truncated output)

import time
import uuid

import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "miromind-ai/MiroThinker-v1.5-30B"  # change if using 1.7 etc.
PORT = 61005

print(f"Loading {MODEL_ID} ... (this takes a while)")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",          # splits layers across both A100s
    trust_remote_code=True,
)
model.eval()
print("Model loaded. Device map:", model.hf_device_map if hasattr(model, "hf_device_map") else "n/a")

app = FastAPI()


class ChatRequest(BaseModel):
    model: str = MODEL_ID
    messages: list
    max_tokens: int | None = 16384
    max_completion_tokens: int | None = None
    temperature: float = 1.0
    top_p: float = 0.95
    stream: bool = False
    # extra_body params arrive as top-level fields with the openai SDK
    repetition_penalty: float | None = None
    continue_final_message: bool | None = None
    add_generation_prompt: bool | None = None

    class Config:
        extra = "allow"


@app.get("/health")
def health():
    return {"status": "ok"}


@app.get("/v1/models")
def models():
    return {"object": "list", "data": [{"id": MODEL_ID, "object": "model"}]}


@app.post("/v1/chat/completions")
def chat(req: ChatRequest):
    continue_final = bool(req.continue_final_message)

    # Build the prompt with the model's own chat template.
    # return_dict=True gives a BatchEncoding (input_ids + attention_mask);
    # handle it as a dict so .shape / generate work correctly.
    enc = tokenizer.apply_chat_template(
        req.messages,
        add_generation_prompt=not continue_final,
        continue_final_message=continue_final,
        return_tensors="pt",
        return_dict=True,
    )
    enc = {k: v.to(model.device) for k, v in enc.items()}
    input_ids = enc["input_ids"]

    prompt_tokens = input_ids.shape[-1]
    max_new = req.max_completion_tokens or req.max_tokens or 16384

    gen_kwargs = dict(
        max_new_tokens=max_new,
        do_sample=req.temperature > 0,
        temperature=max(req.temperature, 1e-5),
        top_p=req.top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    if req.repetition_penalty and req.repetition_penalty != 1.0:
        gen_kwargs["repetition_penalty"] = req.repetition_penalty

    with torch.inference_mode():
        out = model.generate(**enc, **gen_kwargs)

    new_tokens = out[0][prompt_tokens:]
    text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    completion_tokens = new_tokens.shape[-1]
    finish = "length" if completion_tokens >= max_new else "stop"

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": finish,
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=PORT)