triflix committed on
Commit
35eb612
·
verified ·
1 Parent(s): a0d98a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +346 -328
app.py CHANGED
@@ -1,43 +1,40 @@
1
- """
2
- Gemini CLI β†’ OpenAI-Compatible API Proxy
3
- Ultra-fast, reliable, with full streaming support.
4
- Deploy on HuggingFace Spaces (Docker SDK, port 7860).
5
- """
 
 
6
 
7
  import os
8
  import json
9
- import time
10
  import asyncio
11
  import logging
12
- from uuid import uuid4
13
- from datetime import datetime, timezone
14
- from contextlib import asynccontextmanager
15
- from typing import AsyncIterator, Any
16
-
17
  import httpx
18
- from fastapi import FastAPI, Request, HTTPException, Depends
 
19
  from fastapi.responses import StreamingResponse, JSONResponse
20
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
21
  from fastapi.middleware.cors import CORSMiddleware
22
- from google.oauth2.credentials import Credentials
23
- from google.auth.transport.requests import Request as GoogleAuthRequest
24
-
25
- # ────────────────────── Logging ──────────────────────
26
- logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
27
- log = logging.getLogger("gemini-proxy")
28
-
29
- # ────────────────────── Config ───────────────────────
30
- AUTH_PASSWORD = os.environ.get("GEMINI_AUTH_PASSWORD", "")
31
- CREDS_JSON = os.environ.get("GEMINI_CREDENTIALS", "{}")
32
- CLIENT_ID = os.environ.get("GEMINI_CLIENT_ID", "")
33
- CLIENT_SECRET = os.environ.get("GEMINI_CLIENT_SECRET", "")
34
- API_BASE = os.environ.get("GEMINI_API_BASE", "https://cloudcode-pa.googleapis.com")
35
-
36
- # ────────────────────── Globals ──────────────────────
37
- _http: httpx.AsyncClient | None = None
38
- _creds: Credentials | None = None
39
- _lock = asyncio.Lock()
40
- _sec = HTTPBearer(auto_error=False)
41
 
42
  MODELS = [
43
  "gemini-2.5-pro",
@@ -51,338 +48,359 @@ MODELS = [
51
  "gemini-2.5-flash-maxthinking",
52
  ]
53
 
54
- # ════════════════════ APP LIFESPAN ═══════════════════
55
-
56
- @asynccontextmanager
57
- async def lifespan(_app: FastAPI):
58
- global _http
59
- _http = httpx.AsyncClient(
60
- timeout=httpx.Timeout(connect=10, read=300, write=30, pool=10),
61
- limits=httpx.Limits(max_connections=100, max_keepalive_connections=20),
62
- http2=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
- log.info("Proxy ready β€” %d models", len(MODELS))
65
- yield
66
- await _http.aclose()
67
-
68
- app = FastAPI(title="Gemini OpenAI Proxy", lifespan=lifespan)
69
- app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
70
-
71
-
72
- # ════════════════════ AUTH ═══════════════════════════
73
-
74
- def _auth(c: HTTPAuthorizationCredentials = Depends(_sec)):
75
- if not AUTH_PASSWORD:
76
- raise HTTPException(500, "Server missing GEMINI_AUTH_PASSWORD")
77
- if not c or c.credentials != AUTH_PASSWORD:
78
- raise HTTPException(401, "Invalid Bearer token")
79
-
 
 
 
 
 
 
 
 
 
 
80
 
81
- # ════════════════════ TOKEN ══════════════════════════
82
 
83
  async def _token() -> str:
84
  global _creds
85
- async with _lock:
86
- if _creds and _creds.valid and not _creds.expired:
87
- return _creds.token
88
- return await asyncio.to_thread(_refresh)
 
89
 
90
 
91
- def _refresh() -> str:
92
- global _creds
93
- d = json.loads(CREDS_JSON)
94
-
95
- cid = d.get("client_id") or CLIENT_ID
96
- csec = d.get("client_secret") or CLIENT_SECRET
97
- rtok = d.get("refresh_token")
98
- atok = d.get("access_token") or d.get("token")
99
-
100
- missing = []
101
- if not cid: missing.append("GEMINI_CLIENT_ID")
102
- if not csec: missing.append("GEMINI_CLIENT_SECRET")
103
- if not rtok: missing.append("refresh_token")
104
- if missing:
105
- raise HTTPException(500, f"Missing: {', '.join(missing)}")
106
-
107
- exp = None
108
- if "expiry_date" in d:
109
- exp = datetime.fromtimestamp(d["expiry_date"] / 1000, tz=timezone.utc)
110
-
111
- c = Credentials(
112
- token=atok, refresh_token=rtok,
113
- token_uri=d.get("token_uri", "https://oauth2.googleapis.com/token"),
114
- client_id=cid, client_secret=csec, expiry=exp,
115
- )
116
- if not c.valid or c.expired:
117
- c.refresh(GoogleAuthRequest())
118
- log.info("Token refreshed β†’ expires %s", c.expiry)
119
- _creds = c
120
- return c.token
121
 
122
 
123
- # ════════════════════ GEMINI HELPERS ═════════════════
 
 
 
 
 
 
 
 
 
 
124
 
125
- def _parse_model(model: str):
126
- """Returns (base_model, use_search, thinking_budget)."""
127
- search = model.endswith("-search")
128
- no_think = model.endswith("-nothinking")
129
- max_think = model.endswith("-maxthinking")
130
 
131
- base = (model.removesuffix("-search")
132
- .removesuffix("-nothinking")
133
- .removesuffix("-maxthinking"))
 
134
 
135
- budget = None
136
- if no_think: budget = 0
137
- if max_think: budget = 24576
138
 
139
- return base, search, budget
 
 
 
 
 
 
140
 
141
 
142
- def _to_gemini(messages: list, search: bool, budget, **kw) -> dict:
143
- """OpenAI messages β†’ Gemini request body."""
 
 
144
  contents = []
145
- sys_parts = []
146
 
147
- for m in messages:
148
- role, text = m.get("role", "user"), m.get("content", "")
149
- if role == "system":
150
- sys_parts.append({"text": text})
151
- else:
152
- contents.append({
153
- "role": "user" if role == "user" else "model",
154
- "parts": [{"text": text}],
155
- })
156
-
157
- body: dict[str, Any] = {"contents": contents}
158
- if sys_parts:
159
- body["systemInstruction"] = {"parts": sys_parts}
160
-
161
- gc: dict[str, Any] = {}
162
- if kw.get("temperature") is not None: gc["temperature"] = kw["temperature"]
163
- if kw.get("max_tokens"): gc["maxOutputTokens"] = kw["max_tokens"]
164
- if kw.get("top_p") is not None: gc["topP"] = kw["top_p"]
165
- if kw.get("stop"):
166
- gc["stopSequences"] = kw["stop"] if isinstance(kw["stop"], list) else [kw["stop"]]
167
- if budget is not None:
168
- gc["thinkingConfig"] = {"thinkingBudget": budget}
169
- if gc:
170
- body["generationConfig"] = gc
171
-
172
- if search:
173
- body["tools"] = [{"googleSearch": {}}]
174
-
175
- return body
176
-
177
-
178
- # ─── Stream parser: handles both SSE and JSON-array ──
179
-
180
- async def _gemini_stream(url: str, headers: dict, body: dict) -> AsyncIterator[dict]:
181
- """Yields individual Gemini response objects from a stream."""
182
- sse_url = url + "?alt=sse"
183
-
184
- async with _http.stream("POST", sse_url, json=body, headers=headers) as r:
185
- if r.status_code != 200:
186
- err = (await r.aread()).decode(errors="replace")
187
- raise HTTPException(r.status_code, f"Gemini: {err[:500]}")
188
-
189
- ct = r.headers.get("content-type", "")
190
-
191
- if "text/event-stream" in ct:
192
- # ── SSE mode (fast, line-by-line) ──
193
- async for line in r.aiter_lines():
194
- if not line.startswith("data:"):
195
- continue
196
- payload = line[5:].strip()
197
- if not payload or payload == "[DONE]":
198
- continue
199
- try:
200
- yield json.loads(payload)
201
- except json.JSONDecodeError:
202
- continue
203
  else:
204
- # ── JSON-array fallback ──
205
- buf = ""
206
- async for chunk in r.aiter_text():
207
- buf += chunk
208
- while True:
209
- buf = buf.lstrip(" \t\n\r,[")
210
- if not buf or buf[0] != "{":
211
- # also strip trailing ] at end of array
212
- buf = buf.lstrip("]")
213
- break
214
- # find matching }
215
- depth = 0
216
- in_s = 0 # 1 = inside string
217
- esc = 0 # 1 = next char is escaped
218
- found = -1
219
- for i, c in enumerate(buf):
220
- if esc:
221
- esc = 0; continue
222
- if c == "\\" and in_s:
223
- esc = 1; continue
224
- if c == '"':
225
- in_s ^= 1; continue
226
- if in_s:
227
- continue
228
- if c == "{": depth += 1
229
- elif c == "}":
230
- depth -= 1
231
- if depth == 0:
232
- found = i; break
233
- if found < 0:
234
- break # incomplete, need more data
235
- try:
236
- yield json.loads(buf[:found + 1])
237
- except json.JSONDecodeError:
238
- pass
239
- buf = buf[found + 1:]
240
-
241
-
242
- def _text(obj: dict) -> str:
243
- """Extract non-thought text from Gemini response."""
244
- parts = obj.get("candidates", [{}])[0].get("content", {}).get("parts", [])
245
- return "".join(p.get("text", "") for p in parts if not p.get("thought"))
246
-
247
-
248
- def _usage(obj: dict) -> dict:
249
- m = obj.get("usageMetadata", {})
250
- return {
251
- "prompt_tokens": m.get("promptTokenCount", 0),
252
- "completion_tokens": m.get("candidatesTokenCount", 0),
253
- "total_tokens": m.get("totalTokenCount", 0),
254
- }
255
-
256
 
257
- _FINISH_MAP = {"STOP": "stop", "MAX_TOKENS": "length", "SAFETY": "content_filter"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
- def _finish(obj: dict) -> str | None:
260
- r = obj.get("candidates", [{}])[0].get("finishReason")
261
- return _FINISH_MAP.get(r)
 
 
 
 
 
 
 
 
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
 
264
- # ════════════════════ ROUTES ═════════════════════════
265
 
 
266
  @app.get("/")
267
- async def health():
268
- return {"status": "ok", "service": "gemini-openai-proxy", "models": len(MODELS)}
269
 
270
 
271
  @app.get("/v1/models")
272
- async def list_models(_=Depends(_auth)):
273
  return {
274
  "object": "list",
275
- "data": [{"id": m, "object": "model", "owned_by": "google", "created": 0} for m in MODELS],
 
 
 
 
 
 
 
 
276
  }
277
 
278
 
279
  @app.post("/v1/chat/completions")
280
- async def chat(request: Request, _=Depends(_auth)):
281
- body = await request.json()
282
- model = body.get("model", "gemini-2.5-pro")
283
- msgs = body.get("messages", [])
284
- stream = body.get("stream", False)
285
-
286
- if not msgs:
287
- raise HTTPException(400, "messages required")
288
-
289
- base, search, budget = _parse_model(model)
290
- gemini_body = _to_gemini(
291
- msgs, search, budget,
292
- temperature=body.get("temperature"),
293
- max_tokens=body.get("max_tokens") or body.get("max_completion_tokens"),
294
- top_p=body.get("top_p"),
295
- stop=body.get("stop"),
296
- )
297
-
298
  tok = await _token()
299
- hdrs = {"Authorization": f"Bearer {tok}", "Content-Type": "application/json"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
- if stream:
302
- url = f"{API_BASE}/v1/models/{base}:streamGenerateContent"
303
- return StreamingResponse(
304
- _sse_stream(url, hdrs, gemini_body, model),
305
- media_type="text/event-stream",
306
- headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
307
- )
308
-
309
- # ── Non-streaming ──
310
- url = f"{API_BASE}/v1/models/{base}:generateContent"
311
- return await _non_stream(url, hdrs, gemini_body, model)
312
-
313
-
314
- # ─── Streaming response ─────────────────────────────
315
-
316
- async def _sse_stream(url, hdrs, body, model):
317
- cid = f"chatcmpl-{uuid4().hex[:24]}"
318
- ts = int(time.time())
319
-
320
- # role chunk
321
- yield _sse({"id": cid, "object": "chat.completion.chunk", "created": ts, "model": model,
322
- "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}]})
323
-
324
- try:
325
- async for obj in _gemini_stream(url, hdrs, body):
326
- txt = _text(obj)
327
- fin = _finish(obj)
328
-
329
- if txt:
330
- yield _sse({"id": cid, "object": "chat.completion.chunk", "created": ts, "model": model,
331
- "choices": [{"index": 0, "delta": {"content": txt}, "finish_reason": None}]})
332
- if fin:
333
- yield _sse({"id": cid, "object": "chat.completion.chunk", "created": ts, "model": model,
334
- "choices": [{"index": 0, "delta": {}, "finish_reason": fin}]})
335
- except HTTPException as e:
336
- # Send error as SSE event so client knows what happened
337
- yield _sse({"error": {"message": e.detail, "code": e.status_code}})
338
-
339
- yield "data: [DONE]\n\n"
340
-
341
-
342
- # ─── Non-streaming response ─────────────────────────
343
-
344
- async def _non_stream(url, hdrs, body, model):
345
- # Retry once on 401 (expired token)
346
- for attempt in range(2):
347
- r = await _http.post(url, json=body, headers=hdrs)
348
- if r.status_code == 401 and attempt == 0:
349
- global _creds
350
- async with _lock:
351
- _creds = None
352
- tok = await _token()
353
- hdrs["Authorization"] = f"Bearer {tok}"
354
- continue
355
- break
356
-
357
- if r.status_code != 200:
358
- raise HTTPException(r.status_code, f"Gemini: {r.text[:500]}")
359
-
360
- data = r.json()
361
-
362
- # Handle error in body
363
- if "error" in data:
364
- e = data["error"]
365
- raise HTTPException(e.get("code", 500), e.get("message", "Unknown"))
366
-
367
- # Gemini may return list or dict
368
- if isinstance(data, list):
369
- full_text = "".join(_text(item) for item in data)
370
- usg = next((_usage(i) for i in data if _usage(i).get("total_tokens")), _usage({}))
371
- fin = next((_finish(i) for i in data if _finish(i)), "stop")
372
  else:
373
- full_text = _text(data)
374
- usg = _usage(data)
375
- fin = _finish(data) or "stop"
376
 
377
- return JSONResponse({
378
- "id": f"chatcmpl-{uuid4().hex[:24]}",
379
- "object": "chat.completion",
380
- "created": int(time.time()),
381
- "model": model,
382
- "choices": [{"index": 0, "message": {"role": "assistant", "content": full_text}, "finish_reason": fin}],
383
- "usage": usg,
384
- })
 
 
 
 
 
385
 
386
 
387
- def _sse(obj: dict) -> str:
388
- return f"data: {json.dumps(obj, ensure_ascii=False)}\n\n"
 
 
 
1
# ============================================================
# DATETIME FIX — must run first, before any google.auth import.
# Replaces google.auth._helpers.utcnow with a timezone-aware
# (UTC) clock.
# NOTE(review): google-auth historically expects *naive* UTC
# datetimes here; confirm aware values don't break expiry
# comparisons in the installed google-auth version.
# ============================================================
import datetime as _dt
import google.auth._helpers as _gah
_gah.utcnow = lambda: _dt.datetime.now(_dt.timezone.utc)
# ============================================================

import os
import json
import asyncio
import logging
import time
import uuid

import httpx

from fastapi import FastAPI, HTTPException, Depends, Request
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, List, Union
import google.oauth2.credentials
import google.auth.transport.requests

# ── Logging ──────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)

# ── Config from env ──────────────────────────────────────────
AUTH_PASSWORD = os.environ.get("GEMINI_AUTH_PASSWORD", "")  # bearer password clients must present
RAW_CREDS = os.environ.get("GEMINI_CREDENTIALS", "")        # oauth_creds.json contents as a JSON string
PORT = int(os.environ.get("PORT", 7860))                    # HF Spaces serves on 7860 by default

# Cloud Code private API endpoint used by the Gemini CLI.
GEMINI_API_BASE = "https://cloudcode-pa.googleapis.com/v1internal/projects/-/locations/-/endpoints/-"
 
38
 
39
  MODELS = [
40
  "gemini-2.5-pro",
 
48
  "gemini-2.5-flash-maxthinking",
49
  ]
50
 
51
# Thinking budgets per model variant.
# 0 disables thinking for the "-nothinking" variants; 32768 is the large
# budget used by the "-maxthinking" variants.
THINKING_BUDGET = {
    "gemini-2.5-pro-nothinking": 0,
    "gemini-2.5-flash-nothinking": 0,
    "gemini-2.5-pro-maxthinking": 32768,
    "gemini-2.5-flash-maxthinking": 32768,
}

# Variants that get Google Search grounding enabled in the request payload.
SEARCH_MODELS = {"gemini-2.5-pro-search", "gemini-2.5-flash-search"}
61
+
62
# Base model mapping (strip suffix for API call)
def base_model(model: str) -> str:
    """Strip a known variant suffix so the upstream API sees the real model id."""
    for marker in ("-search", "-nothinking", "-maxthinking"):
        if model.endswith(marker):
            return model.removesuffix(marker)
    return model
68
+
69
# ── Credential management ─────────────────────────────────────
# Cached OAuth credentials, built lazily from RAW_CREDS on first use.
_creds: Optional[google.oauth2.credentials.Credentials] = None
# Serializes credential build/refresh across concurrent requests.
_creds_lock = asyncio.Lock()
72
+
73
def _build_creds() -> google.oauth2.credentials.Credentials:
    """Construct OAuth user credentials from the GEMINI_CREDENTIALS JSON blob.

    Raises RuntimeError when the env var is missing entirely.
    """
    if not RAW_CREDS:
        raise RuntimeError("GEMINI_CREDENTIALS env var not set")
    data = json.loads(RAW_CREDS)

    # Derive an absolute UTC expiry from whichever field the blob carries.
    expiry = None
    if "expiry_date" in data:
        # oauth_creds.json stores epoch milliseconds.
        expiry = _dt.datetime.fromtimestamp(data["expiry_date"] / 1000.0, tz=_dt.timezone.utc)
    elif "expiry" in data:
        raw_expiry = data["expiry"]
        if isinstance(raw_expiry, (int, float)):
            expiry = _dt.datetime.fromtimestamp(raw_expiry, tz=_dt.timezone.utc)
        else:
            # ISO-8601 string; treat a naive value as UTC.
            expiry = _dt.datetime.fromisoformat(raw_expiry)
            if expiry.tzinfo is None:
                expiry = expiry.replace(tzinfo=_dt.timezone.utc)

    creds = google.oauth2.credentials.Credentials(
        token=data.get("token") or data.get("access_token"),
        refresh_token=data.get("refresh_token"),
        token_uri=data.get("token_uri", "https://oauth2.googleapis.com/token"),
        client_id=data.get("client_id"),
        client_secret=data.get("client_secret"),
        scopes=data.get("scopes", ["https://www.googleapis.com/auth/cloud-platform"]),
    )
    if expiry:
        creds.expiry = expiry
    return creds
102
+
103
+
104
def _refresh(c: google.oauth2.credentials.Credentials):
    """Synchronously refresh credentials if missing or (nearly) expired.

    Runs in a worker thread (see _token); returns the current —
    possibly refreshed — access token string.
    """
    now = _dt.datetime.now(_dt.timezone.utc)

    # Safely check expiry, handle both aware and naive
    needs_refresh = False
    if c.token is None:
        needs_refresh = True
    elif c.expiry is not None:
        expiry = c.expiry
        if expiry.tzinfo is None:
            # Treat a naive expiry as UTC so the comparison below is valid.
            expiry = expiry.replace(tzinfo=_dt.timezone.utc)
        # refresh 5 minutes early
        needs_refresh = now >= (expiry - _dt.timedelta(minutes=5))
    # NOTE(review): a token present with no expiry is never refreshed here —
    # confirm the credential blob always carries an expiry, or a stale token
    # could be reused indefinitely.

    if needs_refresh:
        logger.info("Refreshing Google OAuth token...")
        request = google.auth.transport.requests.Request()
        # Raises if refresh_token / client id / client secret are missing or invalid.
        c.refresh(request)
        logger.info("Token refreshed successfully.")
    return c.token
125
 
 
126
 
127
async def _token() -> str:
    """Return a valid access token, lazily building and refreshing credentials."""
    global _creds
    async with _creds_lock:
        if _creds is None:
            _creds = _build_creds()
        # Refresh performs blocking network I/O — keep it off the event loop.
        return await asyncio.to_thread(_refresh, _creds)
134
 
135
 
136
# ── FastAPI app ───────────────────────────────────────────────
app = FastAPI(title="geminicli2api", version="1.0.0")

# Wide-open CORS so browser clients on any origin can call the proxy.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
 
147
# ── Auth dependency ───────────────────────────────────────────
async def verify_auth(request: Request):
    """Reject the request (401) unless it carries the configured bearer password."""
    if not AUTH_PASSWORD:
        # No password configured → auth is disabled.
        # NOTE(review): this leaves the proxy fully open; confirm intended.
        return
    header = request.headers.get("Authorization", "")
    # Accept both "Bearer <password>" and a bare password value.
    supplied = header.removeprefix("Bearer ") if header.startswith("Bearer ") else header
    if supplied != AUTH_PASSWORD:
        raise HTTPException(status_code=401, detail="Unauthorized")
158
 
 
 
 
 
 
159
 
160
# ── Pydantic models ───────────────────────────────────────────
class Message(BaseModel):
    """One OpenAI-style chat message."""
    # "system", "user" or "assistant"; other roles are dropped during conversion.
    role: str
    # Plain text, or an OpenAI content-part list (text / image_url dicts).
    content: Union[str, list]
164
 
 
 
 
165
 
166
class ChatRequest(BaseModel):
    """Subset of the OpenAI /v1/chat/completions request body that is honored."""
    model: str = "gemini-2.5-flash"
    messages: List[Message]
    stream: bool = False
    # Generation knobs; None means "leave to upstream defaults".
    max_tokens: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
173
 
174
 
175
# ── Conversion helpers ────────────────────────────────────────
def openai_messages_to_gemini(messages: List[Message]):
    """Convert OpenAI messages to Gemini contents format.

    Returns (system_parts, contents): system messages are collected into
    system_parts; user/assistant messages become Gemini "user"/"model"
    turns. Other roles are silently dropped.
    """
    system_parts = []
    contents = []

    for msg in messages:
        role = msg.role
        content = msg.content

        if isinstance(content, str):
            parts = [{"text": content}]
        elif isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, dict):
                    if item.get("type") == "text":
                        parts.append({"text": item["text"]})
                    elif item.get("type") == "image_url":
                        url = item["image_url"]["url"]
                        # Only base64 data URLs can be inlined; a bare
                        # "data:" URL without ";base64," used to crash the
                        # split below with ValueError (→ HTTP 500).
                        if url.startswith("data:") and ";base64," in url:
                            mime, b64 = url[5:].split(";base64,", 1)
                            parts.append({
                                "inlineData": {"mimeType": mime, "data": b64}
                            })
                        else:
                            # Remote or non-base64 image: degrade to a text marker.
                            parts.append({"text": f"[Image: {url}]"})
                    else:
                        parts.append({"text": str(item)})
                else:
                    parts.append({"text": str(item)})
        else:
            # Unexpected content payload — stringify rather than fail.
            parts = [{"text": str(content)}]

        if role == "system":
            system_parts.extend(parts)
        elif role == "user":
            contents.append({"role": "user", "parts": parts})
        elif role == "assistant":
            contents.append({"role": "model", "parts": parts})

    return system_parts, contents
215
+
216
+
217
def build_gemini_payload(req: ChatRequest) -> dict:
    """Translate an OpenAI-style ChatRequest into a Gemini generateContent body."""
    system_parts, contents = openai_messages_to_gemini(req.messages)

    payload: dict = {"contents": contents}
    if system_parts:
        payload["systemInstruction"] = {"parts": system_parts}

    gen_config: dict = {}
    if req.max_tokens:
        gen_config["maxOutputTokens"] = req.max_tokens
    if req.temperature is not None:
        gen_config["temperature"] = req.temperature
    if req.top_p is not None:
        gen_config["topP"] = req.top_p

    model = req.model
    budget = THINKING_BUDGET.get(model)
    if budget is not None:
        # Explicit variant: pin the budget; expose thoughts only when thinking.
        gen_config["thinkingConfig"] = {
            "thinkingBudget": budget,
            "includeThoughts": budget > 0,
        }
    elif model not in {"gemini-2.0-flash"} and "flash" not in model:
        # Pro models think by default: dynamic budget (-1), thoughts hidden.
        gen_config["thinkingConfig"] = {
            "thinkingBudget": -1,
            "includeThoughts": False,
        }

    if gen_config:
        payload["generationConfig"] = gen_config
    if model in SEARCH_MODELS:
        payload["tools"] = [{"googleSearch": {}}]
    return payload
253
+
254
+
255
def gemini_response_to_openai(gemini_resp: dict, model: str, stream: bool = False) -> dict:
    """Convert a Gemini response (or stream chunk) to OpenAI chat format.

    Args:
        gemini_resp: raw Gemini JSON (one chunk when streaming).
        model: model id echoed back to the client.
        stream: emit a chat.completion.chunk instead of a chat.completion.
    """
    candidates = gemini_resp.get("candidates", [])
    text = ""
    # BUG FIX: mid-stream chunks carry no finishReason; the old default of
    # "STOP" stamped finish_reason="stop" on every chunk, which makes OpenAI
    # streaming clients terminate after the first one. While streaming,
    # finish_reason stays None until Gemini actually reports a reason.
    finish_reason = None if stream else "stop"

    if candidates:
        candidate = candidates[0]
        parts = candidate.get("content", {}).get("parts", [])
        for part in parts:
            # Skip "thought" parts — internal reasoning is not client output.
            if "text" in part and not part.get("thought", False):
                text += part["text"]
        fr = candidate.get("finishReason")
        if fr is not None:
            finish_reason = {
                "STOP": "stop",
                "MAX_TOKENS": "length",
                "SAFETY": "content_filter",
            }.get(fr, "stop")

    usage = gemini_resp.get("usageMetadata", {})
    prompt_tokens = usage.get("promptTokenCount", 0)
    completion_tokens = usage.get("candidatesTokenCount", 0)

    resp_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
    created = int(time.time())

    if stream:
        return {
            "id": resp_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model,
            "choices": [{
                "index": 0,
                "delta": {"content": text},
                "finish_reason": finish_reason,
            }],
        }

    return {
        "id": resp_id,
        "object": "chat.completion",
        "created": created,
        "model": model,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": text},
            "finish_reason": finish_reason,
        }],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        },
    }
310
 
 
311
 
312
# ── Routes ────────────────────────────────────────────────────
@app.get("/")
async def root():
    """Liveness probe: reports service status and the advertised model ids."""
    payload = {"status": "ok", "models": MODELS}
    return payload
316
 
317
 
318
@app.get("/v1/models")
async def list_models(_=Depends(verify_auth)):
    """Return the model catalogue in OpenAI /v1/models format."""
    def _entry(model_id: str) -> dict:
        # Fixed creation timestamp — these are static catalogue entries.
        return {
            "id": model_id,
            "object": "model",
            "created": 1700000000,
            "owned_by": "google",
        }

    return {
        "object": "list",
        "data": [_entry(m) for m in MODELS],
    }
332
 
333
 
334
@app.post("/v1/chat/completions")
async def chat(req: ChatRequest, _=Depends(verify_auth)):
    """OpenAI-compatible chat completions endpoint (streaming and non-streaming)."""
    tok = await _token()
    model = req.model
    api_model = base_model(model)  # strip -search/-nothinking/-maxthinking suffix
    payload = build_gemini_payload(req)

    headers = {
        "Authorization": f"Bearer {tok}",
        "Content-Type": "application/json",
    }

    if req.stream:
        url = f"{GEMINI_API_BASE}:streamGenerateContent?alt=sse&model={api_model}"

        async def generate():
            # Fresh client per request: the generator may outlive the handler.
            async with httpx.AsyncClient(timeout=120) as client:
                async with client.stream("POST", url, headers=headers, json=payload) as resp:
                    if resp.status_code != 200:
                        body = await resp.aread()
                        err = body.decode(errors="replace")
                        logger.error(f"Gemini API error {resp.status_code}: {err}")
                        # Surface the upstream error as an SSE event, then stop.
                        # NOTE(review): no terminating [DONE] is sent on this
                        # path — confirm clients handle that.
                        yield f"data: {json.dumps({'error': err})}\n\n"
                        return

                    # Re-frame upstream SSE: events are separated by blank lines.
                    buffer = ""
                    async for chunk in resp.aiter_text():
                        buffer += chunk
                        while "\n\n" in buffer:
                            event, buffer = buffer.split("\n\n", 1)
                            for line in event.splitlines():
                                if line.startswith("data: "):
                                    data_str = line[6:]
                                    if data_str.strip() == "[DONE]":
                                        yield "data: [DONE]\n\n"
                                        return
                                    try:
                                        gemini_data = json.loads(data_str)
                                        openai_chunk = gemini_response_to_openai(
                                            gemini_data, model, stream=True
                                        )
                                        yield f"data: {json.dumps(openai_chunk)}\n\n"
                                    except json.JSONDecodeError:
                                        # Ignore partial / non-JSON data lines.
                                        pass

            # Upstream closed without [DONE]; terminate the client stream ourselves.
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")
    else:
        url = f"{GEMINI_API_BASE}:generateContent?model={api_model}"
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(url, headers=headers, json=payload)

        if resp.status_code != 200:
            logger.error(f"Gemini API error {resp.status_code}: {resp.text}")
            raise HTTPException(status_code=resp.status_code, detail=resp.text)

        gemini_data = resp.json()
        return gemini_response_to_openai(gemini_data, model)
394
+
395
+
396
# ── Startup ───────────────────────────────────────────────────
@app.on_event("startup")
async def startup():
    """Emit a startup banner and a readiness log line."""
    banner_time = _dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"\n===== Application Startup at {banner_time} =====\n")
    logger.info(f"Proxy ready — {len(MODELS)} models")
401
 
402
 
403
# ── Main ──────────────────────────────────────────────────────
if __name__ == "__main__":
    # Local/dev entry point; bind all interfaces — HF Spaces probes $PORT.
    import uvicorn
    uvicorn.run("app:app", host="0.0.0.0", port=PORT, log_level="info")