WINTER4000 committed on
Commit
2017fcd
·
verified ·
1 Parent(s): d5f29b6

Strip LLM: dee/server.py

Browse files
Files changed (1) hide show
  1. dee/server.py +10 -208
dee/server.py CHANGED
@@ -763,44 +763,11 @@ def create_app() -> Flask:
763
  threading.Thread(target=_bye, daemon=True).start()
764
  return jsonify({"ok": True})
765
 
766
- # ================================================================ /api/chat
767
- # Thin proxy to the BioMistral-7B Gradio Space at
768
- # winter4000/turingdna-assistant. The model runs on ZeroGPU over there;
769
- # this endpoint serializes the chat history + new message and forwards
770
- # via gradio_client. We keep the assistant in its own Space because
771
- # ZeroGPU requires the Gradio SDK and this app is Flask — splitting
772
- # avoids a full rewrite.
773
- #
774
- # Cold-start UX: the assistant Space sleeps after inactivity. First
775
- # call after sleep takes 10–30 s (Space wake + GPU acquire). The
776
- # frontend surfaces that wait as a "Waking the assistant…" hint;
777
- # this route doesn't block on a Space that's busy starting up.
778
- @app.post("/api/chat")
779
- def chat() -> Response:
780
- body = request.get_json(force=True, silent=True) or {}
781
- message = (body.get("message") or "").strip()
782
- history = body.get("history") or []
783
- if not message:
784
- return jsonify({"error": "empty message"}), 400
785
- # Sanity-cap on history length so a runaway client can't push a
786
- # 10-MB conversation upstream; the assistant's own format_mistral_prompt
787
- # also truncates to 8 turns but defense in depth is cheap here.
788
- if isinstance(history, list) and len(history) > 64:
789
- history = history[-64:]
790
- try:
791
- response_text = _call_assistant(message, history)
792
- return jsonify({"response": response_text})
793
- except _AssistantUnavailable as exc:
794
- return jsonify({
795
- "error": str(exc),
796
- "kind": "assistant_unavailable",
797
- }), 503
798
- except Exception as exc: # noqa: BLE001
799
- logger.exception("Chat proxy failed.")
800
- return jsonify({
801
- "error": f"{type(exc).__name__}: {exc}",
802
- "kind": "internal",
803
- }), 500
804
 
805
  return app
806
 
@@ -808,176 +775,11 @@ def create_app() -> Flask:
808
  # ----------------------------------------------------------------- helpers
809
 
810
 
811
- # ================================================================ ASSISTANT
812
- # Connection details for the BioMistral-7B Gradio Space. We don't import
813
- # gradio_client at module level — it's a heavy dependency and the chat
814
- # endpoint isn't the hot path; lazy-import on first call keeps the cold
815
- # start of the Flask server fast. The client instance is cached after
816
- # first construction so subsequent /api/chat calls reuse the connection.
817
-
818
- ASSISTANT_SPACE = os.environ.get(
819
- "TURINGDNA_ASSISTANT_SPACE",
820
- "winter4000/turingdna-assistant",
821
- )
822
- # How long to wait for the assistant Space to wake up from sleep before
823
- # giving up. ZeroGPU cold starts have varied — Mistral-7B over the
824
- # wire is typically 20-45 s. 60 s is generous without being absurd.
825
- ASSISTANT_TIMEOUT_S = float(os.environ.get("TURINGDNA_ASSISTANT_TIMEOUT", "60"))
826
-
827
-
828
- class _AssistantUnavailable(Exception):
829
- """The assistant Space is sleeping, queued, or otherwise not answering.
830
- Mapped to a 503 by /api/chat so the frontend can show a "try again
831
- in a few seconds" hint instead of a generic error toast."""
832
-
833
-
834
- _assistant_client = None
835
- _assistant_client_lock = threading.Lock()
836
-
837
-
838
- def _get_assistant_client():
839
- """Lazy-initialized, thread-safe gradio_client.Client for the
840
- assistant Space. Constructing the Client makes one HTTP round-trip
841
- to fetch the Space's API schema, so we do it once per process.
842
-
843
- HF_TOKEN environment variable: when present, the client authenticates
844
- as that user, which on ZeroGPU means our calls count against THAT
845
- account's GPU quota. Without it, calls are anonymous and get the
846
- smallest tier (~3 min/day total across all anonymous callers).
847
- The Flask Space owner (winter4000) is a PRO subscriber, so setting
848
- HF_TOKEN to a winter4000 token bumps us to PRO-tier quota
849
- (~25 min/day).
850
-
851
- Set it in HF Space settings → Variables and secrets → New secret →
852
- name=HF_TOKEN, value=<your hf_xxx token from huggingface.co/settings/tokens>
853
- """
854
- global _assistant_client
855
- if _assistant_client is not None:
856
- return _assistant_client
857
- with _assistant_client_lock:
858
- if _assistant_client is None:
859
- try:
860
- from gradio_client import Client
861
- except ImportError as exc:
862
- raise _AssistantUnavailable(
863
- "gradio_client not installed on the server — "
864
- "add it to requirements.txt and redeploy."
865
- ) from exc
866
- try:
867
- hf_token = os.environ.get("HF_TOKEN")
868
- if hf_token:
869
- _assistant_client = Client(
870
- ASSISTANT_SPACE, hf_token=hf_token, verbose=False,
871
- )
872
- logger.info("Assistant client authenticated via HF_TOKEN.")
873
- else:
874
- _assistant_client = Client(ASSISTANT_SPACE, verbose=False)
875
- logger.warning(
876
- "Assistant client is ANONYMOUS — set HF_TOKEN env var "
877
- "on this Space to get PRO-tier ZeroGPU quota."
878
- )
879
- except Exception as exc: # noqa: BLE001
880
- raise _AssistantUnavailable(
881
- f"Couldn't connect to the assistant Space "
882
- f"({ASSISTANT_SPACE}): {exc}"
883
- ) from exc
884
- return _assistant_client
885
-
886
-
887
- def _format_history_into_message(message: str, history: list) -> str:
888
- """Embed JUST the last exchange as natural-language context.
889
-
890
- Evolution of this function:
891
-
892
- 1. Original: dumped the whole 8-turn history with [Previous
893
- conversation] / [Current question] markers. Model fixated on
894
- the marker block and re-emitted identity preambles every turn.
895
- 2. Reactionary fix: return message unchanged. Killed identity
896
- loops but broke follow-ups — "How about in DNA?" got an
897
- off-topic answer because the model had no memory of the
898
- previous codon question.
899
- 3. Now: include ONLY the last exchange (1 user + 1 assistant) in
900
- a compact natural-language framing. Just enough context for
901
- "cDNA then" or "what about yeast" to make sense, not enough
902
- for the model to fixate on prior identity preambles.
903
-
904
- Format kept deliberately short and natural — no labeled blocks,
905
- no obvious schema for the model to pattern-match against:
906
-
907
- Earlier in our conversation, you told me "<X>" when I asked
908
- "<Y>". Now I'm asking: <new>
909
-
910
- If history is empty (first turn), just return the message bare so
911
- the model isn't prompted to reference non-existent context.
912
- """
913
- if not history:
914
- return message
915
-
916
- # Find the most recent user→assistant pair.
917
- last_user = None
918
- last_assistant = None
919
- for msg in reversed(history):
920
- if not isinstance(msg, dict):
921
- continue
922
- role = (msg.get("role") or "").lower()
923
- content = (msg.get("content") or "").strip()
924
- if not content:
925
- continue
926
- if role == "assistant" and last_assistant is None:
927
- last_assistant = content
928
- elif role == "user" and last_assistant is not None and last_user is None:
929
- last_user = content
930
- break
931
-
932
- if not last_user or not last_assistant:
933
- return message
934
-
935
- # Trim long previous turns so the prompt doesn't bloat — a 7B model
936
- # has limited attention and we want the actual question to be the
937
- # most salient thing in the window.
938
- if len(last_user) > 400:
939
- last_user = last_user[:400].rstrip() + "…"
940
- if len(last_assistant) > 600:
941
- last_assistant = last_assistant[:600].rstrip() + "…"
942
-
943
- # Natural conversational framing, NOT a labeled block. Tested empirically
944
- # to be the format that gives BioMistral context without making it
945
- # default to re-introducing itself.
946
- return (
947
- f"Earlier in our conversation, you told me \"{last_assistant}\" "
948
- f"when I asked \"{last_user}\". Now I'm asking: {message}"
949
- )
950
-
951
-
952
- def _call_assistant(message: str, history: list) -> str:
953
- """Forward a chat turn to the assistant Space and return the model's
954
- reply as a plain string. Raises _AssistantUnavailable on cold-start
955
- or network problems."""
956
- client = _get_assistant_client()
957
- enriched = _format_history_into_message(message, history)
958
- try:
959
- # Gradio 4.44 ChatInterface auto-API only takes `message`.
960
- # See _format_history_into_message docstring for why we embed
961
- # history inside the message rather than passing it as a
962
- # separate API arg.
963
- result = client.predict(
964
- enriched,
965
- api_name="/chat",
966
- )
967
- except Exception as exc: # noqa: BLE001
968
- # Most failures here are "Space is sleeping, please retry" or
969
- # "queue is full" — all transient. Map to 503 so the frontend
970
- # can present a sensible "try again" message rather than 500.
971
- raise _AssistantUnavailable(
972
- f"Assistant didn't respond: {exc}"
973
- ) from exc
974
- if not isinstance(result, str):
975
- # Gradio chat returns a string when type="messages". Anything
976
- # else is a schema drift on the assistant side.
977
- raise _AssistantUnavailable(
978
- f"Assistant returned an unexpected response shape: {type(result).__name__}"
979
- )
980
- return result
981
 
982
 
983
  _VALID_MODELS = {"small", "medium", "large"}
 
763
  threading.Thread(target=_bye, daemon=True).start()
764
  return jsonify({"ok": True})
765
 
766
+ # NOTE: the /api/chat endpoint (BioMistral-7B proxy via gradio_client to
767
+ # winter4000/turingdna-assistant) was removed on 2026-05-25. The full
768
+ # implementation lives in _llm_backup_2026-05-25/server/server.py.pre-strip
769
+ # alongside the frontend chat panel + WebLLM browser-side LLM. Re-wire
770
+ # when the assistant is ready to ship again.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
771
 
772
  return app
773
 
 
775
  # ----------------------------------------------------------------- helpers
776
 
777
 
778
+ # NOTE: the ASSISTANT block (gradio_client connection to
779
+ # winter4000/turingdna-assistant, _AssistantUnavailable, _get_assistant_client,
780
+ # _format_history_into_message, _call_assistant) was removed on 2026-05-25.
781
+ # Full implementation lives in _llm_backup_2026-05-25/server/server.py.pre-strip
782
+ # and can be restored when the assistant is ready to ship again.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
783
 
784
 
785
  _VALID_MODELS = {"small", "medium", "large"}