"""High-speed chat proxy.

FastAPI application that forwards OpenAI-style chat-completion requests to
one of several upstream providers (selected by the request's ``provider``
field) and re-emits the upstream SSE stream as newline-delimited JSON
``{"response": ...}`` chunks.
"""

from dataclasses import dataclass
from typing import Any, AsyncGenerator, Dict, List, Optional
import json
import re

import httpx
import orjson
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse


def get_models() -> Dict[str, Any]:
    """Return the static model catalog: provider ids plus per-provider models.

    The returned shape is ``{"Providers": [...], "Models": {prov: [{"id", "owned_by"}]}}``.

    NOTE(review): provider "5" deliberately reproduces the original list,
    including its duplicate entries, so the /v1/models payload is unchanged.
    """
    # (model id, owner) pairs, expanded below into {"id", "owned_by"} dicts.
    catalog: Dict[str, List[tuple]] = {
        "1": [
            ("openai/gpt-oss-120b", "OpenAI"),
            ("moonshotai/kimi-k2-instruct", "Moonshot AI"),
            ("canopylabs/orpheus-v1-english", "Canopy Labs"),
            ("llama-3.1-8b-instant", "Meta"),
            ("whisper-large-v3", "OpenAI"),
            ("meta-llama/llama-4-scout-17b-16e-instruct", "Meta"),
            ("allam-2-7b", "SDAIA"),
            ("groq/compound", "Groq"),
            ("canopylabs/orpheus-arabic-saudi", "Canopy Labs"),
            ("llama-3.3-70b-versatile", "Meta"),
            ("qwen/qwen3-32b", "Alibaba Cloud"),
            ("meta-llama/llama-prompt-guard-2-22m", "Meta"),
            ("groq/compound-mini", "Groq"),
            ("meta-llama/llama-guard-4-12b", "Meta"),
            ("openai/gpt-oss-20b", "OpenAI"),
            ("openai/gpt-oss-safeguard-20b", "OpenAI"),
            ("meta-llama/llama-4-maverick-17b-128e-instruct", "Meta"),
            ("moonshotai/kimi-k2-instruct-0905", "Moonshot AI"),
        ],
        "2": [
            ("aisingapore/gemma-sea-lion-v4-27b-it", "AI Singapore"),
            ("defog/sqlcoder-7b-2", "Defog"),
            ("ibm-granite/granite-4.0-h-micro", "IBM"),
            ("meta/llama-3.1-8b-instruct", "Meta"),
            ("microsoft/phi-2", "Microsoft"),
            ("qwen/qwen3-30b-a3b-fp8", "Alibaba Cloud"),
            ("qwen/qwq-32b", "Alibaba Cloud"),
        ],
        "3": [
            ("zai-org/glm-4.6", "Zhipu AI"),
            ("openai/gpt-5-nano-2025-08-07", "OpenAI"),
            ("deepseek-ai/deepseek-v3.2-thinking", "DeepSeek AI"),
            ("nvidia/nvidia-nemotron-3-nano-30b-a3b", "NVIDIA"),
            ("nvidia/nvidia-nemotron-3-nano-30b-a3b-thinking", "NVIDIA"),
            ("openai/gpt-5-mini-2025-08-07", "OpenAI"),
            ("qwen/qwen3-vl-235b-a22b-thinking", "Alibaba Cloud"),
            ("qwen/qwen3-vl-235b-a22b-instruct", "Alibaba Cloud"),
            ("perplexity/sonar", "Perplexity"),
            ("moonshotai/kimi-k2.5", "Moonshot AI"),
            ("anthropic/claude-haiku-4-5-20251001", "Anthropic"),
            ("google/gemini-2.5-flash-lite", "Google"),
            ("moonshotai/kimi-k2-thinking", "Moonshot AI"),
            ("mistralai/devstral-2-123b-instruct-2512", "Mistral AI"),
            ("mistralai/mistral-large-3-675b-instruct-2512", "Mistral AI"),
            ("openai/gpt-oss-safeguard-20b", "OpenAI"),
            ("openai/gpt-oss-120b", "OpenAI"),
        ],
        "4": [
            ("qwen3-4b-thinking-2507", "Alibaba Cloud"),
        ],
        "5": [
            ("meta/llama-3.1-70b-instruct", "Meta"),
            ("qwen/qwen2.5-coder-32b-instruct", "Alibaba Cloud"),
            ("deepseek-ai/deepseek-r1-distill-qwen-32b", "DeepSeek AI"),
            ("meta/llama-4-scout-17b-16e-instruct", "Meta"),
            ("google/gemma-3-12b-it", "Google"),
            ("mistralai/mistral-small-3.1-24b-instruct", "Mistral AI"),
            ("meta/llama-3.3-70b-instruct-fp8-fast", "Meta"),
            ("meta/llama-3.2-3b-instruct", "Meta"),
            ("meta/llama-3.2-1b-instruct", "Meta"),
            ("meta-llama/meta-llama-3-8b-instruct", "Meta"),
            ("meta/llama-3-8b-instruct", "Meta"),
            ("meta/llama-2-7b-chat-int8", "Meta"),
            ("meta/llama-2-7b-chat-fp16", "Meta"),
            ("meta/llama-3-8b-instruct-awq", "Meta"),
            ("meta-llama/meta-llama-3-8b-instruct", "Meta"),  # duplicate kept on purpose
            ("meta/llama-3-8b-instruct", "Meta"),             # duplicate kept on purpose
            ("meta/llama-2-7b-chat-int8", "Meta"),            # duplicate kept on purpose
            ("meta/llama-3-8b-instruct-awq", "Meta"),         # duplicate kept on purpose
            ("google/gemma-7b-it", "Google"),
            ("google/gemma-2b-it-lora", "Google"),
            ("mistral/mistral-7b-instruct-v0.2", "Mistral AI"),
            ("mistral/mistral-7b-instruct-v0.2-lora", "Mistral AI"),
        ],
    }
    return {
        "Providers": list(catalog),
        "Models": {
            prov: [{"id": model_id, "owned_by": owner} for model_id, owner in entries]
            for prov, entries in catalog.items()
        },
    }


try:
    MODEL_NAMES = get_models()
except Exception:
    # Fallback payload for /v1/models if the catalog ever fails to build.
    # NOTE(review): shape differs from get_models(); kept as-is from original.
    MODEL_NAMES = {"GROQ": "GROQ-FALLBACK", "LLMC": "LLMC-FALLBACK"}


class Config:
    """Tunables shared across the proxy."""

    DEFAULT_PROVIDER = "1"
    DEFAULT_MODEL = "llama-3.3-70b-versatile"
    DEFAULT_TEMPERATURE = 0.7
    CHUNK_SIZE = 1000            # upstream read granularity (bytes)
    MAX_CONNECTIONS = 200        # httpx connection-pool cap
    HTTP2 = True
    TIMEOUT = 30.0               # seconds, applied to all httpx operations
    STREAM_BATCH_BYTES = 0       # 0 => flush every chunk immediately


# Per-provider request templates. "{placeholder}" strings are substituted by
# apply_values_to_template() with values from the incoming ChatRequest.
PROVIDERS: Dict[str, Dict[str, Any]] = {
    "1": {
        "AUTH": True,
        "BASE_URL": "https://api.groq.com/openai/v1/chat/completions",
        "DEFAULT_MODEL": "qwen/qwen3-32b",
        "HEADERS": {"Authorization": "Bearer {API}", "Content-Type": "application/json"},
        "PAYLOAD": {
            "model": "{model}",
            "messages": "{messages}",
            "temperature": "{temperature}",
            "stop": None,
            "stream": "{stream}",
        },
    },
    "2": {
        "AUTH": False,
        "BASE_URL": "https://llmchat.in/inference/stream?model={model}",
        "DEFAULT_MODEL": "@cf/meta/llama-3.1-8b-instruct",
        "HEADERS": {
            "Content-Type": "application/json",
            "Accept": "*/*",
            "Origin": "https://llmchat.in",
            "Referer": "https://llmchat.in/",
        },
        "PAYLOAD": {"messages": "{messages}", "stream": "{stream}"},
    },
    "3": {
        "AUTH": False,
        "BASE_URL": "https://adarshji-md.hf.space/gen",
        "DEFAULT_MODEL": "openai/gpt-oss-120b",
        "PAYLOAD": {"api_key": "LOL", "provider": "1", "messages": "{messages}", "model": "{model}", "stream": "{stream}"},
    },
    "4": {
        "AUTH": False,
        "BASE_URL": "https://adarshji-md.hf.space/gen",
        "DEFAULT_MODEL": "qwen3-4b-thinking-2507",
        "PAYLOAD": {"api_key": "LOL", "provider": "2", "messages": "{messages}", "model": "{model}", "stream": "{stream}"},
    },
    "5": {
        "AUTH": False,
        "BASE_URL": "https://adarshji-md.hf.space/gen",
        "DEFAULT_MODEL": "deepseek-ai/deepseek-r1-distill-qwen-32b",
        "PAYLOAD": {"api_key": "LOL", "provider": "3", "messages": "{messages}", "model": "{model}", "stream": "{stream}"},
    },
}

_placeholder_re = re.compile(r"\{(.*?)\}")


def apply_values_to_template(template: Any, values: Dict[str, Any]) -> Any:
    """Recursively substitute ``{name}`` placeholders in a template.

    A string that is *exactly* one placeholder is replaced by the raw value
    (preserving its type, e.g. a list of messages); strings with embedded
    placeholders go through ``str.format`` with non-scalar values JSON-encoded.
    Dicts and lists are walked recursively; anything else passes through.
    """
    if isinstance(template, str):
        m = _placeholder_re.fullmatch(template.strip())
        if m:
            # Whole-string placeholder: return the value itself, untouched.
            return values.get(m.group(1), template)
        str_values = {
            k: (
                v
                if isinstance(v, str)
                else (
                    orjson.dumps(v).decode("utf-8")
                    if not isinstance(v, (int, float, bool, type(None)))
                    else v
                )
            )
            for k, v in values.items()
        }
        try:
            return template.format(**str_values)
        except Exception:
            # Unknown placeholder / stray brace: leave the template unchanged.
            return template
    if isinstance(template, dict):
        return {k: apply_values_to_template(v, values) for k, v in template.items()}
    if isinstance(template, list):
        return [apply_values_to_template(i, values) for i in template]
    return template


def build_values_from_request(req: "ChatRequest") -> Dict[str, Any]:
    """Map a ChatRequest onto the placeholder names used by PROVIDERS templates."""
    return {
        "api_key": req.api_key,
        "API": req.api_key,          # alias used by provider "1" headers
        "messages": req.messages,
        "message": req.messages,     # alias for providers using {message}
        "model": req.model or None,
        "temperature": req.temperature,
        "stream": req.stream,
    }


@dataclass
class ChatRequest:
    """Normalized incoming chat-completion request."""

    api_key: str
    messages: List[Dict[str, Any]]
    model: Optional[str] = None
    provider: str = Config.DEFAULT_PROVIDER
    temperature: float = Config.DEFAULT_TEMPERATURE
    stream: bool = True

    @staticmethod
    def from_dict(payload: Dict[str, Any]) -> "ChatRequest":
        """Build a ChatRequest from a loosely-shaped JSON body.

        Accepts several alias keys (key/apikey, message/msgs, model_name) and
        coerces a single message dict into a one-element list.
        """
        api_key = payload.get("api_key") or payload.get("key") or payload.get("apikey")
        messages = payload.get("messages") or payload.get("message") or payload.get("msgs")
        model = payload.get("model_name") or payload.get("model")
        provider = (payload.get("provider") or Config.DEFAULT_PROVIDER).upper()
        temperature = payload.get("temperature", Config.DEFAULT_TEMPERATURE)
        stream = payload.get("stream", True)
        if messages is None:
            messages = []
        if isinstance(messages, dict):
            messages = [messages]
        return ChatRequest(
            api_key=api_key,
            messages=messages,
            model=model,
            provider=provider,
            temperature=temperature,
            stream=stream,
        )


class AsyncUpstreamClient:
    """Thin async HTTP client wrapper around a shared httpx.AsyncClient."""

    def __init__(self):
        limits = httpx.Limits(max_connections=Config.MAX_CONNECTIONS)
        self._client = httpx.AsyncClient(timeout=Config.TIMEOUT, limits=limits, http2=Config.HTTP2)

    def _prepare_headers(self, headers_template: Dict[str, str], values: Dict[str, Any]) -> Dict[str, str]:
        """Render a header template, dropping entries that resolve to None."""
        headers: Dict[str, str] = {}
        for k, v in headers_template.items():
            rendered = apply_values_to_template(v, values)
            if rendered is None:
                continue
            headers[k] = rendered if isinstance(rendered, str) else str(rendered)
        return headers

    async def close(self):
        """Close the underlying connection pool."""
        await self._client.aclose()

    async def post_json(self, url: str, headers: Dict[str, str], payload: Any) -> Dict[str, Any]:
        """Non-streaming POST; raises httpx.HTTPStatusError on 4xx/5xx."""
        resp = await self._client.post(url, headers=headers, json=payload)
        resp.raise_for_status()
        return resp.json()

    def _is_metadata_blob(self, obj: Dict[str, Any]) -> bool:
        """Heuristic: does this SSE payload carry metadata rather than content?

        NOTE(review): currently unused by stream_post; kept for compatibility.
        """
        if not isinstance(obj, dict):
            return False
        if (
            ("id" in obj and "object" in obj)
            or "x_groq" in obj
            or "tool_calls" in obj
            or ("usage" in obj and isinstance(obj.get("usage"), dict))
        ):
            return True
        if obj.get("choices") and isinstance(obj.get("choices"), list):
            try:
                c0 = obj["choices"][0]
                delta = c0.get("delta", {}) if isinstance(c0, dict) else {}
                content = delta.get("content") or (c0.get("message", {}) or {}).get("content")
                if not content:
                    return True
            except Exception:
                return False
        return False

    async def stream_post(self, url: str, headers: Dict[str, str], payload: Any) -> AsyncGenerator[bytes, None]:
        """POST to an SSE endpoint and re-emit tokens as NDJSON byte chunks.

        Every yielded chunk is ``orjson.dumps({"response": <text>}) + b"\\n"``.
        Reasoning tokens (``delta.reasoning``) are bracketed by "\\n" /
        "\\n\\n" separator chunks, as in the original implementation.

        Fixes vs. original: a line without a "data: " prefix or with
        unparseable JSON no longer reuses the stale ``data_json``/``data``
        from the previous iteration (previously possible via bare
        ``except: pass``), and a ``"reasoning": null`` field no longer causes
        the accompanying content token to be dropped.
        """
        async with self._client.stream("POST", url, headers=headers, json=payload) as resp:
            resp.raise_for_status()
            buf = b""
            in_reasoning = False
            async for chunk in resp.aiter_bytes(chunk_size=Config.CHUNK_SIZE):
                if not chunk:
                    continue
                buf += chunk
                # SSE events are separated by a blank line.
                while b"\n\n" in buf:
                    event, buf = buf.split(b"\n\n", 1)
                    for raw_line in event.splitlines():
                        if not raw_line:
                            continue
                        line = raw_line.decode("utf-8", errors="replace")
                        if "data: " not in line:
                            continue  # not a data field (e.g. "event:", comments)
                        data_json = line.split("data: ", 1)[1]
                        if data_json.strip() == "[DONE]":
                            continue
                        try:
                            data = json.loads(data_json)
                        except ValueError:
                            continue  # malformed / partial JSON frame
                        if not isinstance(data, dict):
                            continue
                        delta: Dict[str, Any] = {}
                        choices = data.get("choices")
                        if isinstance(choices, list) and choices and isinstance(choices[0], dict):
                            delta = choices[0].get("delta") or {}
                        reasoning = delta.get("reasoning")
                        if reasoning:
                            if not in_reasoning:
                                in_reasoning = True
                                yield orjson.dumps({"response": "\n"}) + b"\n"
                            yield orjson.dumps({"response": reasoning}) + b"\n"
                            continue
                        if "response" in data:
                            # Some upstreams already speak our {"response"} shape.
                            yield orjson.dumps({"response": data["response"]}) + b"\n"
                            continue
                        if "content" in delta:
                            if in_reasoning:
                                in_reasoning = False
                                yield orjson.dumps({"response": "\n\n"}) + b"\n"
                            yield orjson.dumps({"response": delta["content"]}) + b"\n"


class ChatService:
    """Routes ChatRequests to the matching provider config and upstream call."""

    def __init__(self, client: Optional[AsyncUpstreamClient] = None):
        self.client = client or AsyncUpstreamClient()

    def _get_provider_config(self, provider_name: str) -> Dict[str, Any]:
        # Unknown providers fall back to the default provider's config.
        return PROVIDERS.get(provider_name.upper(), PROVIDERS.get(Config.DEFAULT_PROVIDER, {}))

    def build_request_for_provider(self, req: ChatRequest) -> Dict[str, Any]:
        """Render the provider's URL, headers, and payload for this request."""
        prov = self._get_provider_config(req.provider)
        values = build_values_from_request(req)
        if not values.get("model"):
            values["model"] = prov.get("DEFAULT_MODEL") or Config.DEFAULT_MODEL
        url = apply_values_to_template(prov.get("BASE_URL", ""), values)
        headers = self.client._prepare_headers(prov.get("HEADERS", {}), values)
        payload = apply_values_to_template(prov.get("PAYLOAD", {}), values)
        return {"url": url, "headers": headers, "payload": payload}

    async def generate(self, req: ChatRequest) -> str:
        """Non-streaming completion; returns the assistant text."""
        data = self.build_request_for_provider(req)
        result = await self.client.post_json(data["url"], data["headers"], data["payload"])
        try:
            return result["choices"][0]["message"]["content"]
        except Exception:
            if isinstance(result, dict) and "response" in result:
                return result["response"]
            # Last resort: hand the raw upstream JSON back to the caller.
            return orjson.dumps(result).decode("utf-8")

    async def generate_stream(self, req: ChatRequest) -> AsyncGenerator[bytes, None]:
        """Streaming completion; yields NDJSON byte chunks from the upstream."""
        data = self.build_request_for_provider(req)
        async for token_bytes in self.client.stream_post(data["url"], data["headers"], data["payload"]):
            yield token_bytes


app = FastAPI(title="High-speed Chat Proxy")
service = ChatService()


@app.on_event("shutdown")
async def shutdown_event():
    # Best effort: the pool may already be closed.
    try:
        await service.client.close()
    except Exception:
        pass


@app.post("/v1/chat/completions")
async def completions(request: Request):
    """Chat-completion endpoint; streams NDJSON or returns a single response."""
    body = await request.json()
    req = ChatRequest.from_dict(body)
    if not req.api_key or not req.messages:
        raise HTTPException(status_code=400, detail="api_key and messages required")

    async def streamer():
        if req.stream:
            buf = bytearray()
            threshold = Config.STREAM_BATCH_BYTES  # 0 => flush every chunk
            async for chunk_bytes in service.generate_stream(req):
                if not chunk_bytes:
                    continue
                buf.extend(chunk_bytes)
                if len(buf) >= threshold:
                    yield b"data: " + bytes(buf)
                    buf.clear()
            if buf:
                yield b"data: " + bytes(buf)
            yield b"data: [DONE]\n\n"
        else:
            text = await service.generate(req)
            yield orjson.dumps({"response": text}) + b"\n"

    return StreamingResponse(
        streamer(),
        media_type="application/x-ndjson",
        headers={"Cache-Control": "no-cache"},
    )


@app.get("/v1/models")
async def models():
    return {"models": MODEL_NAMES}


@app.get("/")
async def root():
    return {"service": "High-speed Chat Proxy", "status": "running"}