Spaces:

WINTER4000
/

syntheogenesis

Running

App Files Community

WINTER4000 committed 4 days ago

Commit

2017fcd

verified ·

1 Parent(s): d5f29b6

Strip LLM: dee/server.py

Browse files

Files changed (1) hide show

dee/server.py +10 -208

dee/server.py CHANGED Viewed

@@ -763,44 +763,11 @@ def create_app() -> Flask:
         threading.Thread(target=_bye, daemon=True).start()
         return jsonify({"ok": True})
-    # ================================================================ /api/chat
-    # Thin proxy to the BioMistral-7B Gradio Space at
-    # winter4000/turingdna-assistant. The model runs on ZeroGPU over there;
-    # this endpoint serializes the chat history + new message and forwards
-    # via gradio_client. We keep the assistant in its own Space because
-    # ZeroGPU requires the Gradio SDK and this app is Flask — splitting
-    # avoids a full rewrite.
-    #
-    # Cold-start UX: the assistant Space sleeps after inactivity. First
-    # call after sleep takes 10–30 s (Space wake + GPU acquire). The
-    # frontend surfaces that wait as a "Waking the assistant…" hint;
-    # this route doesn't block on a Space that's busy starting up.
-    @app.post("/api/chat")
-    def chat() -> Response:
-        body = request.get_json(force=True, silent=True) or {}
-        message = (body.get("message") or "").strip()
-        history = body.get("history") or []
-        if not message:
-            return jsonify({"error": "empty message"}), 400
-        # Sanity-cap on history length so a runaway client can't push a
-        # 10-MB conversation upstream; the assistant's own format_mistral_prompt
-        # also truncates to 8 turns but defense in depth is cheap here.
-        if isinstance(history, list) and len(history) > 64:
-            history = history[-64:]
-        try:
-            response_text = _call_assistant(message, history)
-            return jsonify({"response": response_text})
-        except _AssistantUnavailable as exc:
-            return jsonify({
-                "error": str(exc),
-                "kind": "assistant_unavailable",
-            }), 503
-        except Exception as exc:  # noqa: BLE001
-            logger.exception("Chat proxy failed.")
-            return jsonify({
-                "error": f"{type(exc).__name__}: {exc}",
-                "kind": "internal",
-            }), 500
     return app
@@ -808,176 +775,11 @@ def create_app() -> Flask:
 # ----------------------------------------------------------------- helpers
-# ================================================================ ASSISTANT
-# Connection details for the BioMistral-7B Gradio Space. We don't import
-# gradio_client at module level — it's a heavy dependency and the chat
-# endpoint isn't the hot path; lazy-import on first call keeps the cold
-# start of the Flask server fast. The client instance is cached after
-# first construction so subsequent /api/chat calls reuse the connection.
-ASSISTANT_SPACE = os.environ.get(
-    "TURINGDNA_ASSISTANT_SPACE",
-    "winter4000/turingdna-assistant",
-)
-# How long to wait for the assistant Space to wake up from sleep before
-# giving up. ZeroGPU cold starts have varied — Mistral-7B over the
-# wire is typically 20-45 s. 60 s is generous without being absurd.
-ASSISTANT_TIMEOUT_S = float(os.environ.get("TURINGDNA_ASSISTANT_TIMEOUT", "60"))
-class _AssistantUnavailable(Exception):
-    """The assistant Space is sleeping, queued, or otherwise not answering.
-    Mapped to a 503 by /api/chat so the frontend can show a "try again
-    in a few seconds" hint instead of a generic error toast."""
-_assistant_client = None
-_assistant_client_lock = threading.Lock()
-def _get_assistant_client():
-    """Lazy-initialized, thread-safe gradio_client.Client for the
-    assistant Space. Constructing the Client makes one HTTP round-trip
-    to fetch the Space's API schema, so we do it once per process.
-    HF_TOKEN environment variable: when present, the client authenticates
-    as that user, which on ZeroGPU means our calls count against THAT
-    account's GPU quota. Without it, calls are anonymous and get the
-    smallest tier (~3 min/day total across all anonymous callers).
-    The Flask Space owner (winter4000) is a PRO subscriber, so setting
-    HF_TOKEN to a winter4000 token bumps us to PRO-tier quota
-    (~25 min/day).
-    Set it in HF Space settings → Variables and secrets → New secret →
-    name=HF_TOKEN, value=<your hf_xxx token from huggingface.co/settings/tokens>
-    """
-    global _assistant_client
-    if _assistant_client is not None:
-        return _assistant_client
-    with _assistant_client_lock:
-        if _assistant_client is None:
-            try:
-                from gradio_client import Client
-            except ImportError as exc:
-                raise _AssistantUnavailable(
-                    "gradio_client not installed on the server — "
-                    "add it to requirements.txt and redeploy."
-                ) from exc
-            try:
-                hf_token = os.environ.get("HF_TOKEN")
-                if hf_token:
-                    _assistant_client = Client(
-                        ASSISTANT_SPACE, hf_token=hf_token, verbose=False,
-                    )
-                    logger.info("Assistant client authenticated via HF_TOKEN.")
-                else:
-                    _assistant_client = Client(ASSISTANT_SPACE, verbose=False)
-                    logger.warning(
-                        "Assistant client is ANONYMOUS — set HF_TOKEN env var "
-                        "on this Space to get PRO-tier ZeroGPU quota."
-                    )
-            except Exception as exc:  # noqa: BLE001
-                raise _AssistantUnavailable(
-                    f"Couldn't connect to the assistant Space "
-                    f"({ASSISTANT_SPACE}): {exc}"
-                ) from exc
-        return _assistant_client
-def _format_history_into_message(message: str, history: list) -> str:
-    """Embed JUST the last exchange as natural-language context.
-    Evolution of this function:
-      1. Original: dumped the whole 8-turn history with [Previous
-         conversation] / [Current question] markers. Model fixated on
-         the marker block and re-emitted identity preambles every turn.
-      2. Reactionary fix: return message unchanged. Killed identity
-         loops but broke follow-ups — "How about in DNA?" got an
-         off-topic answer because the model had no memory of the
-         previous codon question.
-      3. Now: include ONLY the last exchange (1 user + 1 assistant) in
-         a compact natural-language framing. Just enough context for
-         "cDNA then" or "what about yeast" to make sense, not enough
-         for the model to fixate on prior identity preambles.
-    Format kept deliberately short and natural — no labeled blocks,
-    no obvious schema for the model to pattern-match against:
-        Earlier in our conversation, you told me "<X>" when I asked
-        "<Y>". Now I'm asking: <new>
-    If history is empty (first turn), just return the message bare so
-    the model isn't prompted to reference non-existent context.
-    """
-    if not history:
-        return message
-    # Find the most recent user→assistant pair.
-    last_user = None
-    last_assistant = None
-    for msg in reversed(history):
-        if not isinstance(msg, dict):
-            continue
-        role = (msg.get("role") or "").lower()
-        content = (msg.get("content") or "").strip()
-        if not content:
-            continue
-        if role == "assistant" and last_assistant is None:
-            last_assistant = content
-        elif role == "user" and last_assistant is not None and last_user is None:
-            last_user = content
-            break
-    if not last_user or not last_assistant:
-        return message
-    # Trim long previous turns so the prompt doesn't bloat — a 7B model
-    # has limited attention and we want the actual question to be the
-    # most salient thing in the window.
-    if len(last_user) > 400:
-        last_user = last_user[:400].rstrip() + "…"
-    if len(last_assistant) > 600:
-        last_assistant = last_assistant[:600].rstrip() + "…"
-    # Natural conversational framing, NOT a labeled block. Tested empirically
-    # to be the format that gives BioMistral context without making it
-    # default to re-introducing itself.
-    return (
-        f"Earlier in our conversation, you told me \"{last_assistant}\" "
-        f"when I asked \"{last_user}\". Now I'm asking: {message}"
-    )
-def _call_assistant(message: str, history: list) -> str:
-    """Forward a chat turn to the assistant Space and return the model's
-    reply as a plain string. Raises _AssistantUnavailable on cold-start
-    or network problems."""
-    client = _get_assistant_client()
-    enriched = _format_history_into_message(message, history)
-    try:
-        # Gradio 4.44 ChatInterface auto-API only takes `message`.
-        # See _format_history_into_message docstring for why we embed
-        # history inside the message rather than passing it as a
-        # separate API arg.
-        result = client.predict(
-            enriched,
-            api_name="/chat",
-        )
-    except Exception as exc:  # noqa: BLE001
-        # Most failures here are "Space is sleeping, please retry" or
-        # "queue is full" — all transient. Map to 503 so the frontend
-        # can present a sensible "try again" message rather than 500.
-        raise _AssistantUnavailable(
-            f"Assistant didn't respond: {exc}"
-        ) from exc
-    if not isinstance(result, str):
-        # Gradio chat returns a string when type="messages". Anything
-        # else is a schema drift on the assistant side.
-        raise _AssistantUnavailable(
-            f"Assistant returned an unexpected response shape: {type(result).__name__}"
-        )
-    return result
 _VALID_MODELS = {"small", "medium", "large"}

         threading.Thread(target=_bye, daemon=True).start()
         return jsonify({"ok": True})
+    # NOTE: the /api/chat endpoint (BioMistral-7B proxy via gradio_client to
+    # winter4000/turingdna-assistant) was removed on 2026-05-25. The full
+    # implementation lives in _llm_backup_2026-05-25/server/server.py.pre-strip
+    # alongside the frontend chat panel + WebLLM browser-side LLM. Re-wire
+    # when the assistant is ready to ship again.
     return app
 # ----------------------------------------------------------------- helpers
+# NOTE: the ASSISTANT block (gradio_client connection to
+# winter4000/turingdna-assistant, _AssistantUnavailable, _get_assistant_client,
+# _format_history_into_message, _call_assistant) was removed on 2026-05-25.
+# Full implementation lives in _llm_backup_2026-05-25/server/server.py.pre-strip
+# and can be restored when the assistant is ready to ship again.
 _VALID_MODELS = {"small", "medium", "large"}