AdarshJi committed on
Commit
baf1e81
·
verified ·
1 Parent(s): 16d6929

Update server.py

Browse files
Files changed (1) hide show
  1. server.py +74 -162
server.py CHANGED
@@ -3,16 +3,10 @@ from typing import List, Dict, Any, AsyncGenerator, Optional
3
  import re
4
  import orjson
5
  import httpx
6
- import logging
7
- import asyncio
8
  from fastapi import FastAPI, Request, HTTPException
9
- from fastapi.responses import StreamingResponse, JSONResponse
 
10
 
11
- # -------------------------
12
- # Logging
13
- # -------------------------
14
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
15
- logger = logging.getLogger("chat-proxy")
16
 
17
 
18
 
@@ -284,48 +278,55 @@ def LLMCM() -> list:
284
  return R
285
 
286
 
 
 
 
287
  try:
288
- MODEL_NAMES = {"GROQ": GROQM(), "LLMC": LLMCM()}
289
  except Exception:
290
  MODEL_NAMES = {"GROQ": "GROQ-FALLBACK", "LLMC": "LLMC-FALLBACK"}
291
 
292
- # -------------------------
293
- # Configuration (conservative defaults)
294
- # -------------------------
295
  class Config:
296
  DEFAULT_PROVIDER = "GROQ"
297
  DEFAULT_MODEL = "llama-3.3-70b-versatile"
298
  DEFAULT_MAX_TOKENS = 512
299
  DEFAULT_TEMPERATURE = 0.7
300
  CHUNK_SIZE = 1000
301
- MAX_CONNECTIONS = 50 # lowered to reduce memory pressure
302
- HTTP2 = False # disable unless you need it
303
  TIMEOUT = 30.0
304
- STREAM_BATCH_BYTES = 0 # 0 = stream as tokens arrive
305
 
306
- # -------------------------
307
- # Provider templates
308
- # -------------------------
309
  PROVIDERS: Dict[str, Dict[str, Any]] = {
310
  "GROQ": {
311
  "AUTH": True,
312
  "BASE_URL": "https://api.groq.com/openai/v1/chat/completions",
313
  "DEFAULT_MODEL": "qwen/qwen3-32b",
314
  "HEADERS": {"Authorization": "Bearer {API}", "Content-Type": "application/json"},
315
- "PAYLOAD": {"model": "{model}", "messages": "{messages}", "temperature": "{temperature}", "max_tokens": "{max_tokens}", "stop": None, "stream": "{stream}"},
 
 
 
 
 
 
 
316
  },
317
  "LLMC": {
318
  "AUTH": True,
319
  "BASE_URL": "https://llmchat.in/inference/stream?model={model}",
320
  "DEFAULT_MODEL": "@cf/meta/llama-3.1-8b-instruct",
321
- "HEADERS": {"Content-Type": "application/json", "Accept": "*/*", "Origin": "https://llmchat.in", "Referer": "https://llmchat.in/"},
 
 
 
 
 
322
  "PAYLOAD": {"messages": "{messages}", "stream": "{stream}"},
323
  },
324
  }
325
 
326
- # -------------------------
327
- # Template helpers
328
- # -------------------------
329
  _placeholder_re = re.compile(r"\{(.*?)\}")
330
 
331
  def apply_values_to_template(template: Any, values: Dict[str, Any]) -> Any:
@@ -333,7 +334,6 @@ def apply_values_to_template(template: Any, values: Dict[str, Any]) -> Any:
333
  m = _placeholder_re.fullmatch(template.strip())
334
  if m:
335
  return values.get(m.group(1), template)
336
- # create safe stringified values for formatting
337
  str_values = {
338
  k: (v if isinstance(v, str) else (orjson.dumps(v).decode("utf-8") if not isinstance(v, (int, float, bool, type(None))) else v))
339
  for k, v in values.items()
@@ -360,9 +360,6 @@ def build_values_from_request(req: "ChatRequest") -> Dict[str, Any]:
360
  "stream": req.stream,
361
  }
362
 
363
- # -------------------------
364
- # Request dataclass
365
- # -------------------------
366
  @dataclass
367
  class ChatRequest:
368
  api_key: str
@@ -388,15 +385,20 @@ class ChatRequest:
388
  messages = [messages]
389
  return ChatRequest(api_key=api_key, messages=messages, model=model, provider=provider, max_tokens=max_tokens, temperature=temperature, stream=stream)
390
 
391
- # -------------------------
392
- # Upstream Async client
393
- # -------------------------
394
  class AsyncUpstreamClient:
395
  def __init__(self):
396
  limits = httpx.Limits(max_connections=Config.MAX_CONNECTIONS)
397
- # create client lazily with conservative settings
398
  self._client = httpx.AsyncClient(timeout=Config.TIMEOUT, limits=limits, http2=Config.HTTP2)
399
 
 
 
 
 
 
 
 
 
 
400
  async def close(self):
401
  await self._client.aclose()
402
 
@@ -431,19 +433,12 @@ class AsyncUpstreamClient:
431
  if not chunk:
432
  continue
433
  buf += chunk
434
- # parse events separated by double-newline
435
  while b"\n\n" in buf:
436
  event, buf = buf.split(b"\n\n", 1)
437
  for line in event.splitlines():
438
  if not line:
439
  continue
440
- payload_bytes = None
441
- if line.startswith(b"data:"):
442
- payload_bytes = line[len(b"data:"):].strip()
443
- elif b"data: " in line:
444
- payload_bytes = line[line.find(b"data: ") + 6:].strip()
445
- else:
446
- payload_bytes = line.strip()
447
  if payload_bytes == b"[DONE]":
448
  if reasoning_open:
449
  yield nd({"response": "\n</think>\n"})
@@ -453,21 +448,23 @@ class AsyncUpstreamClient:
453
  try:
454
  parsed = orjson.loads(payload_bytes)
455
  except Exception:
456
- # not JSON — pass through text
457
  try:
458
  txt = payload_bytes.decode("utf-8", errors="ignore")
459
- if txt and txt.strip():
460
- yield nd({"response": txt})
461
  except Exception:
462
  pass
 
 
463
  continue
464
-
465
- # handle nested response content
466
  if isinstance(parsed, dict) and "response" in parsed:
467
  resp_val = parsed.get("response")
468
  if resp_val is None:
469
  continue
470
- # nested JSON string?
 
 
 
 
471
  inner = None
472
  try:
473
  if isinstance(resp_val, str):
@@ -481,8 +478,6 @@ class AsyncUpstreamClient:
481
  else:
482
  yield nd({"response": str(resp_val)})
483
  continue
484
-
485
- # handle choices/delta format
486
  if isinstance(parsed, dict) and parsed.get("choices"):
487
  try:
488
  c0 = parsed["choices"][0]
@@ -512,11 +507,8 @@ class AsyncUpstreamClient:
512
  continue
513
  except Exception:
514
  pass
515
-
516
  if isinstance(parsed, dict) and self._is_metadata_blob(parsed):
517
  continue
518
-
519
- # fallback: stream parsed object
520
  try:
521
  yield nd({"response": parsed})
522
  except Exception:
@@ -524,19 +516,11 @@ class AsyncUpstreamClient:
524
  yield nd({"response": str(parsed)})
525
  except Exception:
526
  continue
527
-
528
- # handle leftover buffer at exit
529
  if buf:
530
  for line in buf.splitlines():
531
  if not line:
532
  continue
533
- payload_bytes = None
534
- if line.startswith(b"data:"):
535
- payload_bytes = line[len(b"data:"):].strip()
536
- elif b"data: " in line:
537
- payload_bytes = line[line.find(b"data: ") + 6:].strip()
538
- else:
539
- payload_bytes = line.strip()
540
  if payload_bytes == b"[DONE]":
541
  if reasoning_open:
542
  yield orjson.dumps({"response": "\n</think>\n"}) + b"\n"
@@ -556,6 +540,11 @@ class AsyncUpstreamClient:
556
  resp_val = parsed.get("response")
557
  if resp_val is None:
558
  continue
 
 
 
 
 
559
  inner = None
560
  try:
561
  if isinstance(resp_val, str):
@@ -605,9 +594,6 @@ class AsyncUpstreamClient:
605
  except Exception:
606
  continue
607
 
608
- # -------------------------
609
- # ChatService
610
- # -------------------------
611
  class ChatService:
612
  def __init__(self, client: Optional[AsyncUpstreamClient] = None):
613
  self.client = client or AsyncUpstreamClient()
@@ -621,139 +607,65 @@ class ChatService:
621
  if not values.get("model"):
622
  values["model"] = prov.get("DEFAULT_MODEL") or Config.DEFAULT_MODEL
623
  url = apply_values_to_template(prov.get("BASE_URL", ""), values)
624
- headers = {k: (v if isinstance(v, str) else str(v)) for k, v in (self.client._client.headers.items() if hasattr(self.client, "_client") else {})}
625
- # properly prepare headers using provider template
626
- headers = self.client._client.headers.copy() if hasattr(self.client, "_client") else {}
627
- headers.update(self.client._client.headers if hasattr(self.client, "_client") else {})
628
- headers = self.client._client.headers.copy() if hasattr(self.client, "_client") else {}
629
- # use the provider header template (fill placeholders)
630
- headers = self.client._client.headers.copy() if hasattr(self.client, "_client") else {}
631
- headers = {}
632
- for k, v in prov.get("HEADERS", {}).items():
633
- f = apply_values_to_template(v, values)
634
- if f is None:
635
- continue
636
- headers[k] = f if isinstance(f, str) else str(f)
637
  payload = apply_values_to_template(prov.get("PAYLOAD", {}), values)
638
  return {"url": url, "headers": headers, "payload": payload}
639
 
640
  async def generate(self, req: ChatRequest) -> str:
641
  data = self.build_request_for_provider(req)
642
- try:
643
- result = await self.client.post_json(data["url"], data["headers"], data["payload"])
644
- except Exception as e:
645
- logger.exception("Upstream generate error: %s", e)
646
- raise
647
  try:
648
  return result["choices"][0]["message"]["content"]
649
  except Exception:
650
  if isinstance(result, dict) and "response" in result:
651
  return result["response"]
652
- try:
653
- return orjson.dumps(result).decode("utf-8")
654
- except Exception:
655
- return str(result)
656
 
657
  async def generate_stream(self, req: ChatRequest) -> AsyncGenerator[bytes, None]:
658
  data = self.build_request_for_provider(req)
659
  async for token_bytes in self.client.stream_post(data["url"], data["headers"], data["payload"]):
660
  yield token_bytes
661
 
662
- # -------------------------
663
- # FastAPI app and lifecycle
664
- # -------------------------
665
  app = FastAPI(title="High-speed Chat Proxy")
666
- service: Optional[ChatService] = None
667
-
668
- @app.on_event("startup")
669
- async def startup_event():
670
- global service
671
- logger.info("Starting application and initializing upstream client")
672
- try:
673
- upstream_client = AsyncUpstreamClient()
674
- service = ChatService(client=upstream_client)
675
- logger.info("Upstream client initialized")
676
- except Exception as e:
677
- logger.exception("Failed to initialize upstream client: %s", e)
678
- # Let the exception bubble so the server fails fast and logs the problem.
679
- raise
680
 
681
  @app.on_event("shutdown")
682
  async def shutdown_event():
683
- global service
684
- logger.info("Shutting down application and closing upstream client")
685
- if service and getattr(service, "client", None):
686
- try:
687
- await service.client.close()
688
- logger.info("Upstream client closed")
689
- except Exception:
690
- logger.exception("Error closing upstream client")
691
-
692
- # -------------------------
693
- # Endpoints
694
- # -------------------------
695
- @app.get("/")
696
- async def root():
697
- return {"service": "High-speed Chat Proxy", "status": "running"}
698
-
699
- @app.get("/health")
700
- async def health():
701
- return JSONResponse({"status": "ok"})
702
-
703
- @app.get("/v1/models")
704
- async def models():
705
- return {"models": MODEL_NAMES, "default_model": Config.DEFAULT_MODEL}
706
 
707
  @app.post("/v1/chat/completions")
708
  async def completions(request: Request):
709
- try:
710
- body = await request.json()
711
- except Exception:
712
- raise HTTPException(status_code=400, detail="Invalid JSON body")
713
  req = ChatRequest.from_dict(body)
714
  if not req.api_key or not req.messages:
715
  raise HTTPException(status_code=400, detail="api_key and messages required")
716
 
717
- if service is None:
718
- raise HTTPException(status_code=503, detail="service not ready")
719
-
720
  async def streamer():
721
- # streaming path
722
  if req.stream:
723
  buf = bytearray()
724
  threshold = Config.STREAM_BATCH_BYTES
725
- try:
726
- async for chunk_bytes in service.generate_stream(req):
727
- if not chunk_bytes:
728
- continue
729
- buf.extend(chunk_bytes)
730
- if threshold > 0 and len(buf) >= threshold:
731
- yield bytes(buf)
732
- buf.clear()
733
- if buf:
734
  yield bytes(buf)
735
- except Exception as e:
736
- logger.exception("Stream error: %s", e)
737
- # yield an error as JSON so client sees it
738
- try:
739
- yield orjson.dumps({"error": "upstream stream error", "detail": str(e)} ) + b"\n"
740
- except Exception:
741
- pass
742
  else:
743
- # non-streaming: call generate and return NDJSON
744
- try:
745
- text = await service.generate(req)
746
- yield orjson.dumps({"response": text}) + b"\n"
747
- except Exception as e:
748
- logger.exception("Generate error: %s", e)
749
- yield orjson.dumps({"error": "upstream generate error", "detail": str(e)}) + b"\n"
750
 
751
  return StreamingResponse(streamer(), media_type="application/x-ndjson", headers={"Cache-Control": "no-cache"})
752
 
753
- # -------------------------
754
- # If run directly (use single process)
755
- # -------------------------
756
- if __name__ == "__main__":
757
- import uvicorn
758
- # IMPORTANT: run with a single process in platforms like Spaces. Do NOT use --workers there.
759
- uvicorn.run("server:app", host="0.0.0.0", port=8000, loop="asyncio", log_level="info", access_log=True)
 
3
  import re
4
  import orjson
5
  import httpx
 
 
6
  from fastapi import FastAPI, Request, HTTPException
7
+ from fastapi.responses import StreamingResponse
8
+
9
 
 
 
 
 
 
10
 
11
 
12
 
 
278
  return R
279
 
280
 
281
+
282
+
283
+
284
  try:
285
+ MODEL_NAMES = {"GROQ" : GROQM() , "LLMC" : LLMCM()}
286
  except Exception:
287
  MODEL_NAMES = {"GROQ": "GROQ-FALLBACK", "LLMC": "LLMC-FALLBACK"}
288
 
289
+
 
 
290
  class Config:
291
  DEFAULT_PROVIDER = "GROQ"
292
  DEFAULT_MODEL = "llama-3.3-70b-versatile"
293
  DEFAULT_MAX_TOKENS = 512
294
  DEFAULT_TEMPERATURE = 0.7
295
  CHUNK_SIZE = 1000
296
+ MAX_CONNECTIONS = 200
297
+ HTTP2 = True
298
  TIMEOUT = 30.0
299
+ STREAM_BATCH_BYTES = 0
300
 
 
 
 
301
  PROVIDERS: Dict[str, Dict[str, Any]] = {
302
  "GROQ": {
303
  "AUTH": True,
304
  "BASE_URL": "https://api.groq.com/openai/v1/chat/completions",
305
  "DEFAULT_MODEL": "qwen/qwen3-32b",
306
  "HEADERS": {"Authorization": "Bearer {API}", "Content-Type": "application/json"},
307
+ "PAYLOAD": {
308
+ "model": "{model}",
309
+ "messages": "{messages}",
310
+ "temperature": "{temperature}",
311
+ "max_tokens": "{max_tokens}",
312
+ "stop": None,
313
+ "stream": "{stream}",
314
+ },
315
  },
316
  "LLMC": {
317
  "AUTH": True,
318
  "BASE_URL": "https://llmchat.in/inference/stream?model={model}",
319
  "DEFAULT_MODEL": "@cf/meta/llama-3.1-8b-instruct",
320
+ "HEADERS": {
321
+ "Content-Type": "application/json",
322
+ "Accept": "*/*",
323
+ "Origin": "https://llmchat.in",
324
+ "Referer": "https://llmchat.in/",
325
+ },
326
  "PAYLOAD": {"messages": "{messages}", "stream": "{stream}"},
327
  },
328
  }
329
 
 
 
 
330
  _placeholder_re = re.compile(r"\{(.*?)\}")
331
 
332
  def apply_values_to_template(template: Any, values: Dict[str, Any]) -> Any:
 
334
  m = _placeholder_re.fullmatch(template.strip())
335
  if m:
336
  return values.get(m.group(1), template)
 
337
  str_values = {
338
  k: (v if isinstance(v, str) else (orjson.dumps(v).decode("utf-8") if not isinstance(v, (int, float, bool, type(None))) else v))
339
  for k, v in values.items()
 
360
  "stream": req.stream,
361
  }
362
 
 
 
 
363
  @dataclass
364
  class ChatRequest:
365
  api_key: str
 
385
  messages = [messages]
386
  return ChatRequest(api_key=api_key, messages=messages, model=model, provider=provider, max_tokens=max_tokens, temperature=temperature, stream=stream)
387
 
 
 
 
388
  class AsyncUpstreamClient:
389
  def __init__(self):
390
  limits = httpx.Limits(max_connections=Config.MAX_CONNECTIONS)
 
391
  self._client = httpx.AsyncClient(timeout=Config.TIMEOUT, limits=limits, http2=Config.HTTP2)
392
 
393
+ def _prepare_headers(self, headers_template: Dict[str, str], values: Dict[str, Any]) -> Dict[str, str]:
394
+ headers = {}
395
+ for k, v in headers_template.items():
396
+ f = apply_values_to_template(v, values)
397
+ if f is None:
398
+ continue
399
+ headers[k] = f if isinstance(f, str) else str(f)
400
+ return headers
401
+
402
  async def close(self):
403
  await self._client.aclose()
404
 
 
433
  if not chunk:
434
  continue
435
  buf += chunk
 
436
  while b"\n\n" in buf:
437
  event, buf = buf.split(b"\n\n", 1)
438
  for line in event.splitlines():
439
  if not line:
440
  continue
441
+ payload_bytes = line[len(b"data:"):].strip() if line.startswith(b"data:") else (line[line.find(b"data: ") + 6:].strip() if b"data: " in line else line.strip())
 
 
 
 
 
 
442
  if payload_bytes == b"[DONE]":
443
  if reasoning_open:
444
  yield nd({"response": "\n</think>\n"})
 
448
  try:
449
  parsed = orjson.loads(payload_bytes)
450
  except Exception:
451
+ txt = None
452
  try:
453
  txt = payload_bytes.decode("utf-8", errors="ignore")
 
 
454
  except Exception:
455
  pass
456
+ if txt and txt.strip():
457
+ yield nd({"response": txt})
458
  continue
 
 
459
  if isinstance(parsed, dict) and "response" in parsed:
460
  resp_val = parsed.get("response")
461
  if resp_val is None:
462
  continue
463
+ if isinstance(resp_val, (bytes, bytearray)):
464
+ try:
465
+ resp_val = resp_val.decode("utf-8", errors="ignore")
466
+ except Exception:
467
+ continue
468
  inner = None
469
  try:
470
  if isinstance(resp_val, str):
 
478
  else:
479
  yield nd({"response": str(resp_val)})
480
  continue
 
 
481
  if isinstance(parsed, dict) and parsed.get("choices"):
482
  try:
483
  c0 = parsed["choices"][0]
 
507
  continue
508
  except Exception:
509
  pass
 
510
  if isinstance(parsed, dict) and self._is_metadata_blob(parsed):
511
  continue
 
 
512
  try:
513
  yield nd({"response": parsed})
514
  except Exception:
 
516
  yield nd({"response": str(parsed)})
517
  except Exception:
518
  continue
 
 
519
  if buf:
520
  for line in buf.splitlines():
521
  if not line:
522
  continue
523
+ payload_bytes = line[len(b"data:"):].strip() if line.startswith(b"data:") else (line[line.find(b"data: ") + 6:].strip() if b"data: " in line else line.strip())
 
 
 
 
 
 
524
  if payload_bytes == b"[DONE]":
525
  if reasoning_open:
526
  yield orjson.dumps({"response": "\n</think>\n"}) + b"\n"
 
540
  resp_val = parsed.get("response")
541
  if resp_val is None:
542
  continue
543
+ if isinstance(resp_val, (bytes, bytearray)):
544
+ try:
545
+ resp_val = resp_val.decode("utf-8", errors="ignore")
546
+ except Exception:
547
+ continue
548
  inner = None
549
  try:
550
  if isinstance(resp_val, str):
 
594
  except Exception:
595
  continue
596
 
 
 
 
597
  class ChatService:
598
  def __init__(self, client: Optional[AsyncUpstreamClient] = None):
599
  self.client = client or AsyncUpstreamClient()
 
607
  if not values.get("model"):
608
  values["model"] = prov.get("DEFAULT_MODEL") or Config.DEFAULT_MODEL
609
  url = apply_values_to_template(prov.get("BASE_URL", ""), values)
610
+ headers = self.client._prepare_headers(prov.get("HEADERS", {}), values)
 
 
 
 
 
 
 
 
 
 
 
 
611
  payload = apply_values_to_template(prov.get("PAYLOAD", {}), values)
612
  return {"url": url, "headers": headers, "payload": payload}
613
 
614
  async def generate(self, req: ChatRequest) -> str:
615
  data = self.build_request_for_provider(req)
616
+ result = await self.client.post_json(data["url"], data["headers"], data["payload"])
 
 
 
 
617
  try:
618
  return result["choices"][0]["message"]["content"]
619
  except Exception:
620
  if isinstance(result, dict) and "response" in result:
621
  return result["response"]
622
+ return orjson.dumps(result).decode("utf-8")
 
 
 
623
 
624
  async def generate_stream(self, req: ChatRequest) -> AsyncGenerator[bytes, None]:
625
  data = self.build_request_for_provider(req)
626
  async for token_bytes in self.client.stream_post(data["url"], data["headers"], data["payload"]):
627
  yield token_bytes
628
 
 
 
 
629
  app = FastAPI(title="High-speed Chat Proxy")
630
+ service = ChatService()
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  @app.on_event("shutdown")
633
  async def shutdown_event():
634
+ try:
635
+ await service.client.close()
636
+ except Exception:
637
+ pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
638
 
639
  @app.post("/v1/chat/completions")
640
  async def completions(request: Request):
641
+ body = await request.json()
 
 
 
642
  req = ChatRequest.from_dict(body)
643
  if not req.api_key or not req.messages:
644
  raise HTTPException(status_code=400, detail="api_key and messages required")
645
 
 
 
 
646
  async def streamer():
 
647
  if req.stream:
648
  buf = bytearray()
649
  threshold = Config.STREAM_BATCH_BYTES
650
+ async for chunk_bytes in service.generate_stream(req):
651
+ if not chunk_bytes:
652
+ continue
653
+ buf.extend(chunk_bytes)
654
+ if len(buf) >= threshold:
 
 
 
 
655
  yield bytes(buf)
656
+ buf.clear()
657
+ if buf:
658
+ yield bytes(buf)
 
 
 
 
659
  else:
660
+ text = await service.generate(req)
661
+ yield orjson.dumps({"response": text}) + b"\n"
 
 
 
 
 
662
 
663
  return StreamingResponse(streamer(), media_type="application/x-ndjson", headers={"Cache-Control": "no-cache"})
664
 
665
+ @app.get("/v1/models")
666
+ async def models():
667
+ return {"models": MODEL_NAMES, "default_model": Config.DEFAULT_MODEL}
668
+
669
+ @app.get("/")
670
+ async def root():
671
+ return {"service": "High-speed Chat Proxy", "status": "running"}