AdarshJi commited on
Commit
033253e
·
verified ·
1 Parent(s): 56447dc

Create server.py

Browse files
Files changed (1) hide show
  1. server.py +835 -0
server.py ADDED
@@ -0,0 +1,835 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # server_sse.py
2
+ import asyncio
3
+ import inspect
4
+ import logging
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict, List, Optional, AsyncGenerator,Tuple
7
+ import time
8
+ import json
9
+ import uuid
10
+ import aiohttp
11
+ from fastapi import FastAPI, HTTPException, Request
12
+ from fastapi.responses import StreamingResponse, JSONResponse
13
+ from curl_cffi.requests import Session
14
+
15
+
16
+ # try fast json library
17
+ try:
18
+ import orjson as _jsonlib # type: ignore
19
+ def _loads(b: bytes):
20
+ return _jsonlib.loads(b)
21
+ def _dumps(obj) -> str:
22
+ # orjson.dumps returns bytes
23
+ return _jsonlib.dumps(obj).decode("utf-8")
24
+ except Exception:
25
+ import json as _jsonlib # type: ignore
26
+ def _loads(b: bytes):
27
+ return _jsonlib.loads(b)
28
+ def _dumps(obj) -> str:
29
+ return _jsonlib.dumps(obj)
30
+
31
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("chat-server-sse")

# Shared curl_cffi session impersonating Chrome, reused by every provider
# call (kept as a module-level global on purpose).
Requests = Session(impersonate="chrome110")

app = FastAPI()
38
+
39
+
40
# Model catalog for the FREEGPT (llmchat.in) provider.
# Each entry maps a provider tag ("@cf" cloudflare / "@hf" huggingface) and
# model id to the max_tokens limit sent upstream (None = no explicit limit).
# FIX: exact duplicate entries removed — lookups use next(first match), so
# the duplicates were dead weight and polluted the /model listing.
M2 = [
    {"tag": "@cf", "model": "meta/llama-3.1-70b-instruct", "max_tokens": 8192},
    {"tag": "@cf", "model": "qwen/qwen2.5-coder-32b-instruct", "max_tokens": 8192},
    {"tag": "@cf", "model": "deepseek-ai/deepseek-r1-distill-qwen-32b", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-4-scout-17b-16e-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "google/gemma-3-12b-it", "max_tokens": 40960},
    {"tag": "@cf", "model": "mistralai/mistral-small-3.1-24b-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-3.3-70b-instruct-fp8-fast", "max_tokens": 8192},
    {"tag": "@cf", "model": "meta/llama-3.2-3b-instruct", "max_tokens": 40960},
    {"tag": "@cf", "model": "meta/llama-3.2-1b-instruct", "max_tokens": 40960},
    {"tag": "@hf", "model": "meta-llama/meta-llama-3-8b-instruct", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-3-8b-instruct", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-2-7b-chat-int8", "max_tokens": 4391},
    {"tag": "@cf", "model": "meta/llama-2-7b-chat-fp16", "max_tokens": None},
    {"tag": "@cf", "model": "meta/llama-3-8b-instruct-awq", "max_tokens": 4391},
    {"tag": "@hf", "model": "google/gemma-7b-it", "max_tokens": None},
    {"tag": "@cf", "model": "google/gemma-2b-it-lora", "max_tokens": 4391},
    {"tag": "@hf", "model": "mistral/mistral-7b-instruct-v0.2", "max_tokens": 8192},
    {"tag": "@cf", "model": "mistral/mistral-7b-instruct-v0.2-lora", "max_tokens": 8192},
]
164
+
165
def FREEGPT(
    RQ: Any,
    messages: List[Dict],
    model: str = "deepseek-ai/deepseek-r1-distill-qwen-32b",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """
    Stream completion fragments from the llmchat.in SSE endpoint.

    Args:
        RQ: a requests-like session object with .post().
        messages: chat messages in plain role/content form.
        model: model id; resolved against M2, with a small fallback model.
        max_token: caller's token limit, used only when M2 has no limit.
        stream / timeout: forwarded to the HTTP request.

    Yields:
        str fragments from each SSE event's "response" field.
    """
    # Resolve "<tag>/<model>" from the catalog; unknown models fall back
    # to a small default.
    md = next(
        (item["tag"] + "/" + item["model"] for item in M2 if item["model"] == model),
        "@cf/meta/llama-3.2-1b-instruct",
    )
    URL = f"https://llmchat.in/inference/stream?model={md}"

    headers = {
        "Accept": "text/event-stream,*/*",
        "Content-Type": "application/json",
        "Origin": "https://llmchat.in",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
        "Cache-Control": "no-cache",
        "Accept-Encoding": "identity",
        "cf-ray": "9cba9edd9f909aaf-SIN",
    }

    # FIX: the original built "max_tokens" twice (the catalog value simply
    # overwrote the caller's); make the precedence explicit instead.
    payload = {"messages": messages, "stream": stream}
    catalog_limit = next(
        (item["max_tokens"] for item in M2 if item["model"] == model and item["max_tokens"] is not None),
        None,
    )
    if catalog_limit is not None:
        payload["max_tokens"] = catalog_limit
    elif max_token is not None:
        payload["max_tokens"] = max_token

    try:
        RESP = RQ.post(url=URL, json=payload, headers=headers, timeout=timeout, stream=stream)
    except Exception:
        # Best-effort provider: log and end the stream instead of raising.
        logger.exception("FREEGPT request failed")
        return
    if RESP.status_code != 200:
        logger.warning("FREEGPT non-200 status: %s", RESP.status_code)
        return

    for raw in RESP.iter_lines():
        if not raw:
            continue
        # errors="replace" cannot raise, so no secondary decode path needed.
        line = raw.decode("utf-8", errors="replace").strip()
        if not line.startswith("data:"):
            continue
        # FIX: strip the "data:" prefix robustly — split('data: ')[1] raised
        # IndexError whenever the payload had no space after the colon.
        data_json = line[len("data:"):].lstrip()
        try:
            data = json.loads(data_json)
        except (ValueError, TypeError):
            continue
        if isinstance(data, dict) and "response" in data:
            yield data["response"]
228
+
229
+
230
+
231
class CONV:
    """Converts between plain role/content messages and the UI "parts" format."""

    def __init__(self, default_system: str = ""):
        # System prompt used when no override is supplied.
        self.default_system = default_system

    @staticmethod
    def _make_id() -> str:
        """Return a fresh 20-character hex message id."""
        return uuid.uuid4().hex[:20]

    def alpaca_to_msg(
        self,
        alpaca_obj: Dict[str, Any],
        insert_system: bool = True,
        system_override: Optional[str] = None,
        skip_empty: bool = True,
    ) -> Tuple[List[Dict[str, str]], float]:
        """
        Flatten "parts"-style messages into [{"role", "content"}, ...].

        Only parts with type == "text" contribute; fragments are joined with
        blank lines.  Returns (messages, elapsed_seconds).
        """
        started = time.perf_counter()
        converted: List[Dict[str, str]] = []

        system_text = self.default_system if system_override is None else system_override
        if insert_system and system_text is not None:
            converted.append({"role": "system", "content": system_text})

        for message in alpaca_obj:
            role = (message.get("role") or "").strip().lower()
            if role not in ("user", "assistant", "system"):
                role = "user"

            # Gather textual fragments in order, trimming trailing whitespace.
            fragments = [
                part["text"].rstrip()
                for part in (message.get("parts") or [])
                if isinstance(part, dict)
                and part.get("type") == "text"
                and isinstance(part.get("text"), str)
                and part.get("text")
            ]

            if fragments:
                converted.append({"role": role, "content": "\n\n".join(fragments)})
            elif not skip_empty:
                # Preserve the role even when there is no text.
                converted.append({"role": role, "content": ""})

        return converted, time.perf_counter() - started

    def msg_to_alpaca(
        self,
        msg_list: List[Dict[str, Any]],
        include_step_start: bool = True,
        assistant_state_done: bool = True,
        preserve_ids: bool = False,
        skip_empty_text_parts: bool = False,
    ) -> Tuple[Dict[str, List[Dict[str, Any]]], float]:
        """
        Expand role/content messages into the UI "parts" format.

        Assistant messages optionally get a leading step-start part and a
        "done" state on their text part.  Returns (messages, elapsed_seconds).
        """
        started = time.perf_counter()
        converted: List[Dict[str, Any]] = []

        for entry in msg_list:
            if isinstance(entry, dict):
                role = (entry.get("role") or "user").strip().lower()
                body = entry.get("content", "")
                original_id = entry.get("id") if preserve_ids else None
            else:
                # Bare strings (or other scalars) become user messages.
                role, body, original_id = "user", str(entry), None

            if role not in ("user", "assistant"):
                role = "user"

            parts: List[Dict[str, Any]] = []
            if role == "assistant" and include_step_start:
                parts.append({"type": "step-start"})

            if isinstance(body, str) and (not skip_empty_text_parts or body.strip()):
                text_part: Dict[str, Any] = {"type": "text", "text": body}
                if role == "assistant" and assistant_state_done:
                    text_part["state"] = "done"
                parts.append(text_part)

            keep_original = isinstance(original_id, str) and original_id != ""
            converted.append(
                {
                    "id": original_id if keep_original else self._make_id(),
                    "role": role,
                    "parts": parts,
                    "metadata": {"custom": {}},
                }
            )

        return converted, time.perf_counter() - started
338
+
339
+
340
# Models routed through the Adarsh_Personal provider.
# FIX: two list entries were missing trailing commas, so Python silently
# concatenated three adjacent string literals into one bogus model id and
# made "kimi-k2-thinking", "devstral-2" and "mistral-large-3" unselectable.
M1 = [
    "zai-org/glm-4.6",
    "openai/gpt-5-nano-2025-08-07",
    "deepseek-ai/deepseek-v3.2-thinking",
    "nvidia/nvidia-nemotron-3-nano-30b-a3b",
    "nvidia/nvidia-nemotron-3-nano-30b-a3b-thinking",
    "openai/gpt-5-mini-2025-08-07",
    "qwen/qwen3-vl-235b-a22b-thinking",
    "qwen/qwen3-vl-235b-a22b-instruct",
    "perplexity/sonar",
    "moonshotai/kimi-k2.5",
    "anthropic/claude-haiku-4-5-20251001",  # deprecating model
    "google/gemini-2.5-flash-lite",
    "moonshotai/kimi-k2-thinking",
    "mistralai/devstral-2-123b-instruct-2512",  # good model
    "mistralai/mistral-large-3-675b-instruct-2512",
    "openai/gpt-oss-safeguard-20b",
    "openai/gpt-oss-120b",
]
360
+
361
+
362
def Adarsh_Personal(
    RQ: Any,
    messages: List[Dict],
    model: str = "deepseek-ai/deepseek-r1-distill-qwen-32b",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """
    Stream chat deltas from the hadadxyz-ai HF space.

    Reasoning deltas are wrapped in <think> ... </think> markers so the
    caller receives a single text stream.  Yields str fragments.
    `max_token` is accepted for interface parity but not sent upstream.
    """
    in_reasoning = False
    URL = "https://hadadxyz-ai.hf.space/api/mz1a85y5n80zy5127hgsba5f3a9c2d1Np0x300vcgduqxb7ep084fygd016c9a2d16fa8b3c41gut432pvjctr75hhspjae25d6f7a8b9c0d1e2pjf43v16f3a4b5c6dd7e8fba2bdx9a0b6dv1c2d7e2b4c9f83d6a4f1bb6c152f9pe3c7a88qv5d91f3c2b765g134bp9a41ne4yx4b3vda8w074"

    # Convert plain role/content messages into the "parts" format the UI expects.
    NEW_MSGS, _ = CONV().msg_to_alpaca(messages, include_step_start=True, assistant_state_done=True)

    # NOTE(review): session/client/turnstile identifiers below are hard-coded,
    # presumably captured from one browser session — confirm they stay valid.
    payload = {
        "tools": {},
        "modelId": model,
        "sessionId": "sess_7ef524b9_mlfe4ped",
        "clientId": "7ef524b98a963b507ec9f4000fdea38c-mlfe4pea",
        "requestId": "req_7ef524b9_mlfg1cpq_jjxb7p",
        "clientIp": "122.161.52.54",
        "realIp": "122.161.52.54",
        "forwardedFor": "122.161.52.54",
        "userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36",
        "id": "DEFAULT_THREAD_ID",
        "messages": NEW_MSGS,
        "trigger": "submit-message",
        "metadata": {},
    }

    headers = {
        "Accept": "text/event-stream, */*",
        "Content-Type": "application/json",
        "Origin": "https://hadadxyz-ai.hf.space",
        "User-Agent": payload["userAgent"],
        "Cache-Control": "no-cache",
        "Accept-Encoding": "identity",
        "x-turnstile-token": "mlfe5357-zq9depfzhpb-e18cbvzrpid",
        "x-turnstile-verified": "true",
    }

    try:
        RESP = RQ.post(URL, json=payload, headers=headers, stream=stream, timeout=timeout)
    except Exception:
        # FIX: the request was previously unguarded; a network error blew up
        # the whole SSE response.  Log and end the stream instead.
        logger.exception("Adarsh_Personal request failed")
        return
    if RESP.status_code != 200:
        logger.warning("Adarsh_Personal non-200 status: %s", RESP.status_code)
        return

    for raw in RESP.iter_lines():
        if not raw:
            continue
        # errors="replace" cannot raise, so one decode path suffices.
        line = raw.decode("utf-8", errors="replace").strip()
        if not line.startswith("data:"):
            continue
        # FIX: split('data: ')[1] raised IndexError when the event had no
        # space after "data:"; slice the prefix off instead.
        try:
            data = json.loads(line[len("data:"):].lstrip())
        except (ValueError, TypeError):
            continue
        if not isinstance(data, dict):
            continue

        kind = data.get("type")
        if kind == "reasoning-delta":
            if not in_reasoning:
                in_reasoning = True
                yield "<think>\n"
            if "delta" in data:
                yield data["delta"]
        elif kind == "text-delta":
            if in_reasoning:
                in_reasoning = False
                yield "\n</think>\n"
            if "delta" in data:
                yield data["delta"]
448
+
449
# Models routed through the QWEN provider (single fixed model upstream).
M3 = ["qwen3-4b-thinking-2507"]
450
+
451
def QWEN(
    RQ: Any,
    messages: List[Dict],
    model: str = "NONE",
    max_token: int = 40960,
    stream: bool = True,
    timeout: Optional[float] = None,
):
    """
    Stream chat output from the teichai qwen3-4b-thinking HF space.

    Reasoning chunks are wrapped in <think> ... </think> markers; yields
    str fragments.  `model` and `max_token` are accepted for interface
    parity — the upstream endpoint serves one fixed model.
    """

    def _events(rq: Any, msgs: List[Dict], to: Optional[float] = None):
        """Yield parsed JSON objects from the upstream SSE /api/chat stream."""
        API_URL = "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space/api/chat"
        payload = {"messages": msgs, "searchEnabled": False}
        headers = {
            "Accept": "*/*",
            "Content-Type": "application/json",
            "Origin": "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space",
            "Referer": "https://teichai-qwen3-4b-thinking-2507-claude-4-5-opus.hf.space/",
            "User-Agent": "python-requests/2.x",
        }

        resp = rq.post(API_URL, headers=headers, json=payload, stream=stream, timeout=to)

        buffered: List[str] = []
        for raw in resp.iter_lines():
            if raw is None:
                continue
            line = raw.decode("utf-8", errors="replace").strip()

            if line == "":
                # A blank line terminates one SSE event: flush the buffer.
                if not buffered:
                    continue
                data_text = "".join(buffered)
                buffered = []
                if data_text == "[DONE]":
                    break
                try:
                    yield json.loads(data_text)
                except json.JSONDecodeError:
                    pass
                continue

            if line.startswith("data:"):
                buffered.append(line[len("data:"):].lstrip())

    in_reasoning = False
    # FIX: the inner generator was called with message=... which raised
    # TypeError (its parameter is messages); the provider never produced
    # a single chunk.
    for event in _events(rq=RQ, msgs=messages, to=timeout):
        if event.get("type") == "reasoning":
            if not in_reasoning:
                in_reasoning = True
                yield "<think>\n"
            if "content" in event:
                yield event["content"]
        else:
            if in_reasoning:
                in_reasoning = False
                yield "\n</think>\n\n"
            if "content" in event:
                yield event["content"]
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
533
+
534
+
535
+
536
+
537
+
538
+
539
+
540
+
541
+
542
+
543
+
544
+
545
+
546
# Provider registry: client-selected key -> handler callable + the model
# list that handler actually serves.
# FIX: the model lists were crossed — QWEN talks to the qwen3-4b space
# (M3) and FREEGPT resolves its models against the M2 catalog, but they
# were paired with each other's lists.
PROVIDERS: Dict[str, Dict[str, Any]] = {
    "1": {"__func__": Adarsh_Personal, "models": M1},
    "2": {"__func__": QWEN, "models": M3},
    "3": {"__func__": FREEGPT, "models": M2},
}

# Filled on startup to avoid per-request introspection of provider callables.
PROVIDER_META: Dict[str, Dict[str, Any]] = {}
554
+
555
class Config:
    """Server-wide defaults applied when a request payload omits a field."""

    DEFAULT_PROVIDER = "1"
    # NOTE(review): this default model id does not appear in M1/M2/M3 —
    # providers fall back to their own defaults for unknown models; confirm
    # this is intentional.
    DEFAULT_MODEL = "llama-3.3-70b-versatile"
    DEFAULT_MAX_TOKENS = 512
    # Accepted from clients but not forwarded by any provider in this file.
    DEFAULT_TEMPERATURE = 0.7
    # Per-request provider timeout, in seconds.
    TIMEOUT = 30.0
    STREAM = True
562
+
563
+
564
@dataclass
class ChatRequest:
    """Normalized chat request parsed from a loosely-shaped JSON payload."""

    api_key: str
    messages: List[Dict[str, Any]]
    model: Optional[str] = None
    provider: str = Config.DEFAULT_PROVIDER
    max_tokens: int = Config.DEFAULT_MAX_TOKENS
    temperature: float = Config.DEFAULT_TEMPERATURE
    stream: bool = Config.STREAM

    @staticmethod
    def from_dict(payload: Dict[str, Any]) -> "ChatRequest":
        """Build a ChatRequest, accepting several alias spellings per field."""
        key = payload.get("api_key") or payload.get("key") or payload.get("apikey")
        msgs = payload.get("messages") or payload.get("message") or payload.get("msgs")
        if msgs is None:
            msgs = []
        elif isinstance(msgs, dict):
            # A single message object is promoted to a one-element list.
            msgs = [msgs]
        return ChatRequest(
            api_key=key,
            messages=msgs,
            model=payload.get("model_name") or payload.get("model"),
            # keep "1", "2", "3" style keys even if sent as ints
            provider=str(payload.get("provider") or Config.DEFAULT_PROVIDER),
            max_tokens=payload.get("max_tokens", Config.DEFAULT_MAX_TOKENS),
            temperature=payload.get("temperature", Config.DEFAULT_TEMPERATURE),
            stream=payload.get("stream", Config.STREAM),
        )
597
+
598
+
599
# Shared aiohttp session; created on startup and closed on shutdown.
GLOBAL_AIOHTTP: Optional[aiohttp.ClientSession] = None
600
+
601
+
602
@app.on_event("startup")
async def on_startup():
    """Create the shared aiohttp session and precompute provider call metadata."""
    global GLOBAL_AIOHTTP, PROVIDER_META
    logger.info("Starting up - creating global aiohttp session and analyzing providers")
    GLOBAL_AIOHTTP = aiohttp.ClientSession()

    # Classify each provider callable once so request handling never needs
    # per-call introspection.
    for key, entry in PROVIDERS.items():
        fn = entry["__func__"]
        is_async_gen = inspect.isasyncgenfunction(fn)
        is_coro = inspect.iscoroutinefunction(fn)
        is_gen = inspect.isgeneratorfunction(fn)
        PROVIDER_META[key] = {
            "func": fn,
            "is_async_gen_fn": is_async_gen,
            "is_coroutine_fn": is_coro,
            "is_generator_fn": is_gen,
            # a plain sync callable when none of the above apply
            "is_sync": not (is_coro or is_async_gen or is_gen),
        }
    logger.info("Provider metadata prepared")
619
+
620
+
621
@app.on_event("shutdown")
async def on_shutdown():
    """Close the shared aiohttp session if one was created and is still open."""
    global GLOBAL_AIOHTTP
    logger.info("Shutting down - closing global aiohttp session")
    session = GLOBAL_AIOHTTP
    if session is not None and not session.closed:
        await session.close()
627
+
628
+
629
async def _call_provider_and_iterate(
    provider_key: str,
    messages: List[Dict],
    model: str,
    max_token: int,
    stream_flag: bool,
    timeout: float,
) -> AsyncGenerator[bytes, None]:
    """
    Invoke the provider selected by `provider_key` and yield its output as
    raw bytes.  The caller turns these bytes into SSE events.

    Raises:
        ValueError: if the provider key is unknown.
    Timeouts and provider errors are reported in-band as a final
    "[server_timeout] ..." / "[server_error] ..." chunk.
    """
    if provider_key not in PROVIDER_META:
        raise ValueError(f"Unknown provider '{provider_key}'")

    meta = PROVIDER_META[provider_key]
    func = meta["func"]

    def _to_bytes(item: Any) -> Optional[bytes]:
        """Normalize one provider chunk to bytes; None chunks are dropped."""
        if item is None:
            return None
        if isinstance(item, bytes):
            return item
        if isinstance(item, str):
            return item.encode("utf-8")
        return str(item).encode("utf-8")

    def _iter_result(res: Any):
        """Yield chunks from a provider return value of any supported shape."""
        if res is None:
            return
        if isinstance(res, (list, tuple)):
            yield from res
            return
        if inspect.isgenerator(res) or (hasattr(res, "__iter__") and not isinstance(res, (str, bytes, dict))):
            yield from res
            return
        yield res

    try:
        if meta["is_async_gen_fn"]:
            # FIX: providers take messages=/model= (lowercase); the old
            # Message=/Model= keywords would have raised TypeError here.
            agen = func(Requests, messages=messages, model=model, max_token=max_token, stream=stream_flag, timeout=timeout)
            async for item in agen:
                chunk = _to_bytes(item)
                if chunk is not None:
                    yield chunk
            return

        if meta["is_coroutine_fn"]:
            res = await asyncio.wait_for(
                func(Requests, messages=messages, model=model, max_token=max_token, stream=stream_flag, timeout=timeout),
                timeout=timeout,
            )
            for item in _iter_result(res):
                chunk = _to_bytes(item)
                if chunk is not None:
                    yield chunk
            return

        # Sync function or generator function: call it off the event loop.
        # FIX: the old code also created an unawaited coroutine for this path
        # ("coroutine was never awaited" warnings).
        # NOTE(review): for generator providers this moves only the *call*
        # into a thread; iteration below still runs on the event loop and
        # may block on network reads — confirm acceptable.
        sync_res = await asyncio.wait_for(
            asyncio.to_thread(func, Requests, messages, model, max_token, stream_flag, timeout),
            timeout=timeout,
        )
        for item in _iter_result(sync_res):
            chunk = _to_bytes(item)
            if chunk is not None:
                yield chunk

    except asyncio.TimeoutError:
        err = f"[server_timeout] provider {provider_key} exceeded {timeout}s\n"
        logger.warning(err.strip())
        yield err.encode("utf-8")
    except Exception as e:
        logger.exception("Provider error")
        err = f"[server_error] {type(e).__name__}: {e}\n"
        yield err.encode("utf-8")
750
+
751
+
752
@app.post("/chat")
async def chat_endpoint(request: Request):
    """
    POST /chat — run the selected provider over the supplied messages.

    stream=True  -> SSE stream of  data: {"response": "..."}  events,
                    terminated by a bare "[DONE]" line (no "data:" prefix).
    stream=False -> single JSON body {"text": "<full output>"}.

    Raises:
        HTTPException 400: invalid JSON body, or missing api_key/messages.
    """
    try:
        payload = _loads(await request.body())
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"invalid json: {e}")

    req = ChatRequest.from_dict(payload)
    if not req.api_key or not req.messages:
        raise HTTPException(status_code=400, detail="api_key and messages required")

    def _chunk_text(chunk: Any) -> str:
        """Decode a provider chunk (bytes or anything else) to text."""
        if isinstance(chunk, bytes):
            return chunk.decode("utf-8", errors="ignore")
        return str(chunk)

    provider_iter = _call_provider_and_iterate(
        provider_key=req.provider,
        messages=req.messages,
        model=req.model or Config.DEFAULT_MODEL,
        max_token=req.max_tokens,
        stream_flag=req.stream,
        timeout=Config.TIMEOUT,
    )

    if req.stream:
        async def sse_stream_gen():
            # One SSE event per provider chunk.  _dumps already falls back to
            # the stdlib json module, and {"response": <str>} is always
            # serializable, so the old per-chunk fallback import is gone.
            async for raw_chunk in provider_iter:
                json_str = _dumps({"response": _chunk_text(raw_chunk)})
                yield f"data: {json_str}\n\n".encode("utf-8")
            # Final terminator, deliberately sent as a bare line.
            yield b"[DONE]\n"

        return StreamingResponse(sse_stream_gen(), media_type="text/event-stream")

    # Non-stream: collect everything and return one JSON body.
    collected = [_chunk_text(chunk) async for chunk in provider_iter]
    return JSONResponse({"text": "".join(collected)})
825
+
826
+
827
@app.get("/model")
async def model():
    """Return every known model id, grouped by provider model list."""
    return {"models": [M1, M2, M3]}
831
+
832
+
833
@app.get("/health")
async def health_check():
    """Liveness probe endpoint."""
    return {"status": "ok"}