"""Chat server: serves a custom HTML frontend and streams completions over SSE."""

import json
import os

from fastapi import Request
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
from gradio import Server
from openai import OpenAI

# ── OpenAI-compatible client pointing at HF Router ──────────────────────────
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=os.environ.get("HF_TOKEN", ""),
    default_headers={"X-HF-Bill-To": "huggingface"},
)

MODEL = "MiniMaxAI/MiniMax-M3:novita"

app = Server()


def _sse_event(payload):
    """Serialize *payload* as JSON and frame it as one Server-Sent Events message."""
    return f"data: {json.dumps(payload)}\n\n"


# ── Serve the custom frontend ────────────────────────────────────────────────
@app.get("/")
async def homepage():
    """Return the contents of ``index.html`` located next to this file."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    html_path = os.path.join(base_dir, "index.html")
    with open(html_path, "r", encoding="utf-8") as f:
        content = f.read()
    return HTMLResponse(content=content)


# ── Streaming chat endpoint (SSE) ────────────────────────────────────────────
@app.post("/chat")
async def chat_stream(request: Request):
    """
    Stream a chat completion for the posted conversation.

    Accepts JSON body:
    {
      "messages": [
        { "role": "user", "content": "..." }            <- text-only
        { "role": "user", "content": [                  <- multimodal
            {"type": "text", "text": "..."},
            {"type": "image_url", "image_url": {"url": "..."}}
        ] }
      ]
    }

    Returns an SSE stream. Each event is ``data: {"token": "..."}``; a failure
    while talking to the model is reported as ``data: {"error": "..."}``. The
    stream always ends with ``data: [DONE]``.

    A body that is not valid JSON, or is not a JSON object, is rejected with
    HTTP 400 and a JSON ``{"error": ...}`` payload.
    """
    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError (and UnicodeDecodeError) are ValueError subclasses.
        return JSONResponse(
            {"error": "Request body must be valid JSON."},
            status_code=400,
        )
    if not isinstance(body, dict):
        return JSONResponse(
            {"error": "Request body must be a JSON object."},
            status_code=400,
        )

    messages = body.get("messages", [])

    # Deliberately a *sync* generator: `client` is the blocking OpenAI client,
    # and StreamingResponse iterates sync generators in a thread pool, so the
    # event loop stays free while waiting on the upstream model.
    def generate():
        try:
            stream = client.chat.completions.create(
                model=MODEL,
                messages=messages,
                stream=True,
            )
            for chunk in stream:
                # Some chunks (e.g. usage-only ones) carry no choices at all.
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                content = delta.content if delta is not None else None
                if content:
                    yield _sse_event({"token": content})
        except Exception as e:
            # Boundary handler: the HTTP status is already sent once streaming
            # starts, so the failure is surfaced to the client in-band.
            yield _sse_event({"error": str(e)})
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )


app.launch(show_error=True)