SarahXia0405 committed on
Commit
34ec5a2
·
verified ·
1 Parent(s): 3268902

Update api/clare_core.py

Browse files
Files changed (1) hide show
  1. api/clare_core.py +226 -16
api/clare_core.py CHANGED
@@ -2,6 +2,8 @@
2
  import os
3
  import re
4
  import math
 
 
5
  from typing import List, Dict, Tuple, Optional
6
 
7
  from docx import Document
@@ -18,12 +20,87 @@ from .config import (
18
  from langsmith import traceable
19
  from langsmith.run_helpers import set_run_metadata
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # ----------------------------
22
- # Speed knobs
23
  # ----------------------------
24
- MAX_HISTORY_TURNS = int(os.getenv("CLARE_MAX_HISTORY_TURNS", "4")) # was 6
25
- MAX_RAG_CHARS_IN_PROMPT = int(os.getenv("CLARE_MAX_RAG_CHARS", "600")) # was 1200
26
- DEFAULT_MAX_OUTPUT_TOKENS = int(os.getenv("CLARE_MAX_OUTPUT_TOKENS", "450"))
 
 
 
 
 
 
27
 
28
 
29
  # ---------- syllabus 解析 ----------
@@ -315,21 +392,56 @@ def find_similar_past_question(
315
  return None
316
 
317
 
318
- @traceable(run_type="llm", name="safe_chat_completion")
319
- def safe_chat_completion(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  model_name: str,
321
  messages: List[Dict[str, str]],
322
  lang: str,
323
  op: str = "chat",
324
  temperature: float = 0.5,
325
  max_tokens: Optional[int] = None,
326
- ) -> str:
 
 
 
 
 
327
  preferred_model = model_name or DEFAULT_MODEL
328
- last_error: Optional[Exception] = None
329
  max_tokens = int(max_tokens or DEFAULT_MAX_OUTPUT_TOKENS)
330
 
 
 
331
  for attempt in range(2):
332
  current_model = preferred_model if attempt == 0 else DEFAULT_MODEL
 
 
 
333
  try:
334
  resp = client.chat.completions.create(
335
  model=current_model,
@@ -337,18 +449,97 @@ def safe_chat_completion(
337
  temperature=temperature,
338
  max_tokens=max_tokens,
339
  timeout=20,
 
340
  )
341
- return resp.choices[0].message.content or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  except Exception as e:
343
  print(
344
- f"[safe_chat_completion][{op}] attempt {attempt+1} "
345
  f"failed with model={current_model}: {repr(e)}"
346
  )
347
  last_error = e
348
  if current_model == DEFAULT_MODEL or attempt == 1:
349
  break
350
 
351
- return build_error_message(last_error or Exception("unknown error"), lang, op)
 
 
 
352
 
353
 
354
  def build_messages(
@@ -447,18 +638,22 @@ def build_messages(
447
  }
448
  )
449
 
 
 
450
  if rag_context:
451
- rc = rag_context[:MAX_RAG_CHARS_IN_PROMPT]
 
 
452
  messages.append(
453
  {
454
  "role": "system",
455
  "content": (
456
- "Relevant excerpts (use as primary grounding):\n\n" + rc
457
  ),
458
  }
459
  )
460
 
461
- # Only keep the last N turns for speed
462
  trimmed_history = history[-MAX_HISTORY_TURNS:] if history else []
463
  for user, assistant in trimmed_history:
464
  messages.append({"role": "user", "content": user})
@@ -466,9 +661,24 @@ def build_messages(
466
  messages.append({"role": "assistant", "content": assistant})
467
 
468
  messages.append({"role": "user", "content": user_message})
 
 
 
 
 
 
 
 
 
 
 
469
  return messages
470
 
471
 
 
 
 
 
472
  @traceable(run_type="chain", name="chat_with_clare")
473
  def chat_with_clare(
474
  message: str,
@@ -503,7 +713,7 @@ def chat_with_clare(
503
  rag_context=rag_context,
504
  )
505
 
506
- answer = safe_chat_completion(
507
  model_name=model_name,
508
  messages=messages,
509
  lang=language_preference,
@@ -577,7 +787,7 @@ def summarize_conversation(
577
  if language_preference == "中文":
578
  messages.append({"role": "system", "content": "请用中文输出要点总结(bullet points)。"})
579
 
580
- summary_text = safe_chat_completion(
581
  model_name=model_name,
582
  messages=messages,
583
  lang=language_preference,
 
2
  import os
3
  import re
4
  import math
5
+ import time
6
+ import json
7
  from typing import List, Dict, Tuple, Optional
8
 
9
  from docx import Document
 
20
  from langsmith import traceable
21
  from langsmith.run_helpers import set_run_metadata
22
 
23
+
24
+ # ============================
25
+ # Token helpers (optional tiktoken)
26
+ # ============================
27
+ def _safe_import_tiktoken():
28
+ try:
29
+ import tiktoken # type: ignore
30
+ return tiktoken
31
+ except Exception:
32
+ return None
33
+
34
+
35
+ def _approx_tokens(text: str) -> int:
36
+ if not text:
37
+ return 0
38
+ return max(1, int(len(text) / 4))
39
+
40
+
41
def _count_text_tokens(text: str, model: str = "") -> int:
    """Count the tokens in *text*.

    Uses tiktoken when it is importable (model-specific encoding if *model*
    resolves, otherwise ``cl100k_base``); falls back to the character-based
    estimate when tiktoken is absent.
    """
    tk = _safe_import_tiktoken()
    if tk is None:
        return _approx_tokens(text)

    encoder = None
    if model:
        try:
            encoder = tk.encoding_for_model(model)
        except Exception:
            # Unknown model name — fall back to the generic encoding below.
            encoder = None
    if encoder is None:
        encoder = tk.get_encoding("cl100k_base")

    return len(encoder.encode(text or ""))
52
+
53
+
54
def _count_messages_tokens(messages: List[Dict[str, str]], model: str = "") -> int:
    """Estimate the total prompt tokens for a chat-message list.

    Adds a fixed per-message overhead (4 + 2 tokens) on top of the role and
    content token counts — an engineering approximation, not an exact match
    for any provider's accounting.
    """
    per_message_totals = []
    for msg in messages or []:
        role_tokens = _count_text_tokens(str(msg.get("role", "")), model=model)
        content_tokens = _count_text_tokens(str(msg.get("content", "")), model=model)
        per_message_totals.append(4 + role_tokens + content_tokens + 2)
    return sum(per_message_totals)
63
+
64
+
65
def _truncate_to_tokens(text: str, max_tokens: int, model: str = "") -> str:
    """Trim *text* so its token count does not exceed *max_tokens*.

    With tiktoken available the cut is exact (encode, slice, decode).
    Without it, the text is shrunk by character ratio and then nibbled down
    10% at a time until the heuristic estimate fits; at least ~50 characters
    are always kept on that path.
    """
    if not text:
        return text

    tk = _safe_import_tiktoken()
    if tk is None:
        # Heuristic path (no tiktoken): character-ratio shrink + refinement loop.
        estimated = _approx_tokens(text)
        if estimated <= max_tokens:
            return text
        ratio = max_tokens / max(1, estimated)
        cut = max(50, min(len(text), int(len(text) * ratio)))
        trimmed = text[:cut]
        while _approx_tokens(trimmed) > max_tokens and len(trimmed) > 50:
            trimmed = trimmed[: int(len(trimmed) * 0.9)]
        return trimmed

    try:
        enc = tk.encoding_for_model(model) if model else tk.get_encoding("cl100k_base")
    except Exception:
        enc = tk.get_encoding("cl100k_base")

    token_ids = enc.encode(text or "")
    if len(token_ids) <= max_tokens:
        return text
    return enc.decode(token_ids[:max_tokens])
90
+
91
+
92
# ----------------------------
# Speed knobs (HARD LIMITS)
# ----------------------------
# 1) History: keep only the most recent 10 (user, assistant) turns.
MAX_HISTORY_TURNS = int(os.getenv("CLARE_MAX_HISTORY_TURNS", "10"))

# 2) RAG: "at most 4 chunks x 500 tokens each" is enforced in rag_engine.py;
#    this knob only caps the TOTAL tokens injected into the LLM prompt so the
#    prompt cannot blow up.
MAX_RAG_TOKENS_IN_PROMPT = int(os.getenv("CLARE_MAX_RAG_TOKENS", "2000"))

# 3) Default cap on generated output tokens (max_new_tokens): 384.
DEFAULT_MAX_OUTPUT_TOKENS = int(os.getenv("CLARE_MAX_OUTPUT_TOKENS", "384"))
104
 
105
 
106
  # ---------- syllabus 解析 ----------
 
392
  return None
393
 
394
 
395
def _log_prompt_token_breakdown(
    messages: List[Dict[str, str]],
    system_prompt: str,
    rag_context: str,
    trimmed_history: List[Tuple[str, str]],
    user_message: str,
    model_name: str,
):
    """Print a ``[LLM_PROMPT_TOKENS]`` JSON line breaking down estimated prompt tokens, and return the stats dict."""
    history_tokens = 0
    for user_turn, assistant_turn in (trimmed_history or []):
        history_tokens += _count_text_tokens(user_turn or "", model=model_name)
        history_tokens += _count_text_tokens(assistant_turn or "", model=model_name)

    stats = {
        "system_tokens": _count_text_tokens(system_prompt, model=model_name),
        "rag_tokens": _count_text_tokens(rag_context or "", model=model_name),
        "history_tokens": history_tokens,
        "user_tokens": _count_text_tokens(user_message or "", model=model_name),
        "prompt_tokens_total_est": _count_messages_tokens(messages, model=model_name),
        "history_turns_kept": len(trimmed_history or []),
        "max_rag_tokens_in_prompt": MAX_RAG_TOKENS_IN_PROMPT,
        "max_output_tokens": DEFAULT_MAX_OUTPUT_TOKENS,
        "model": model_name or DEFAULT_MODEL,
    }
    print("[LLM_PROMPT_TOKENS] " + json.dumps(stats, ensure_ascii=False))
    return stats
419
+
420
+
421
@traceable(run_type="llm", name="safe_chat_completion_profiled")
def safe_chat_completion_profiled(
    model_name: str,
    messages: List[Dict[str, str]],
    lang: str,
    op: str = "chat",
    temperature: float = 0.5,
    max_tokens: Optional[int] = None,
) -> Tuple[str, Dict]:
    """Run a chat completion with latency profiling and a layered fallback.

    Strategy (per attempt, up to 2 attempts; attempt 1 uses *model_name*,
    attempt 2 retries with ``DEFAULT_MODEL``):
      1. Streaming request — measures real time-to-first-token (TTFT).
      2. Non-streaming request — used if streaming raises; TTFT unavailable.
    If every attempt fails, returns a localized error message from
    ``build_error_message`` instead of raising.

    Returns:
      - answer text
      - profiling dict {ttft_ms, llm_total_ms, gen_ms, output_tokens_est, tokens_per_sec_est, streaming_used}
    """
    preferred_model = model_name or DEFAULT_MODEL
    max_tokens = int(max_tokens or DEFAULT_MAX_OUTPUT_TOKENS)

    last_error: Optional[Exception] = None

    for attempt in range(2):
        # Attempt 0: caller's preferred model; attempt 1: the default model.
        current_model = preferred_model if attempt == 0 else DEFAULT_MODEL

        # 1) Try streaming for real TTFT
        t0 = time.perf_counter()
        try:
            # NOTE(review): assumes `client` is an OpenAI-compatible client
            # defined at module level — confirm against the imports above.
            resp = client.chat.completions.create(
                model=current_model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=20,
                stream=True,
            )

            first_token_t = None
            out_parts: List[str] = []
            for event in resp:
                # OpenAI-style: event.choices[0].delta.content
                try:
                    delta = event.choices[0].delta.content  # type: ignore
                except Exception:
                    # Some stream events carry no content delta (e.g. role or
                    # finish events); skip them.
                    delta = None
                if not delta:
                    continue
                if first_token_t is None:
                    # First content chunk observed — this marks TTFT.
                    first_token_t = time.perf_counter()
                out_parts.append(delta)

            t_end = time.perf_counter()
            answer = "".join(out_parts)

            # All timings in milliseconds; gen_ms excludes the TTFT wait.
            ttft_ms = None if first_token_t is None else (first_token_t - t0) * 1000.0
            total_ms = (t_end - t0) * 1000.0
            gen_ms = None if first_token_t is None else (t_end - first_token_t) * 1000.0
            out_tokens = _count_text_tokens(answer, model=current_model)
            tokens_per_sec = None
            if gen_ms and gen_ms > 0:
                tokens_per_sec = out_tokens / (gen_ms / 1000.0)

            prof = {
                "streaming_used": True,
                "ttft_ms": ttft_ms,
                "llm_total_ms": total_ms,
                "gen_ms": gen_ms,
                "output_tokens_est": out_tokens,
                "tokens_per_sec_est": tokens_per_sec,
                "model": current_model,
                "max_tokens": max_tokens,
            }
            print("[LLM_PROFILING] " + json.dumps(prof, ensure_ascii=False))
            return answer, prof

        except Exception as e:
            last_error = e
            # fall through to non-stream fallback below
            # NOTE(review): if the stream fails mid-generation, any partial
            # output is discarded and a full second request is issued.

        # 2) Non-stream fallback (TTFT not available; approximate)
        try:
            t0 = time.perf_counter()
            resp2 = client.chat.completions.create(
                model=current_model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                timeout=20,
            )
            t_end = time.perf_counter()
            answer = resp2.choices[0].message.content or ""

            total_ms = (t_end - t0) * 1000.0
            out_tokens = _count_text_tokens(answer, model=current_model)
            tokens_per_sec = None
            if total_ms > 0:
                tokens_per_sec = out_tokens / (total_ms / 1000.0)

            prof = {
                "streaming_used": False,
                "ttft_ms": None,  # not measurable without stream
                "llm_total_ms": total_ms,
                "gen_ms": None,
                "output_tokens_est": out_tokens,
                "tokens_per_sec_est": tokens_per_sec,
                "model": current_model,
                "max_tokens": max_tokens,
                "note": "non-stream fallback; ttft_ms unavailable",
            }
            print("[LLM_PROFILING] " + json.dumps(prof, ensure_ascii=False))
            return answer, prof

        except Exception as e:
            print(
                f"[safe_chat_completion_profiled][{op}] attempt {attempt+1} "
                f"failed with model={current_model}: {repr(e)}"
            )
            last_error = e
            # No point retrying if we already used the default model.
            if current_model == DEFAULT_MODEL or attempt == 1:
                break

    # Both attempts exhausted: return a user-facing error string plus a
    # minimal profiling dict recording the failure.
    return build_error_message(last_error or Exception("unknown error"), lang, op), {
        "streaming_used": False,
        "error": repr(last_error) if last_error else "unknown",
    }
543
 
544
 
545
  def build_messages(
 
638
  }
639
  )
640
 
641
+ # RAG context: enforce token cap here (in addition to rag_engine caps)
642
+ rag_text_for_prompt = ""
643
  if rag_context:
644
+ rag_text_for_prompt = _truncate_to_tokens(
645
+ rag_context, max_tokens=MAX_RAG_TOKENS_IN_PROMPT, model=model_name_or_default(DEFAULT_MODEL)
646
+ )
647
  messages.append(
648
  {
649
  "role": "system",
650
  "content": (
651
+ "Relevant excerpts (use as primary grounding):\n\n" + rag_text_for_prompt
652
  ),
653
  }
654
  )
655
 
656
+ # Only keep the last N turns for speed (HARD LIMIT)
657
  trimmed_history = history[-MAX_HISTORY_TURNS:] if history else []
658
  for user, assistant in trimmed_history:
659
  messages.append({"role": "user", "content": user})
 
661
  messages.append({"role": "assistant", "content": assistant})
662
 
663
  messages.append({"role": "user", "content": user_message})
664
+
665
+ # prompt token breakdown log
666
+ _log_prompt_token_breakdown(
667
+ messages=messages,
668
+ system_prompt=CLARE_SYSTEM_PROMPT,
669
+ rag_context=rag_text_for_prompt,
670
+ trimmed_history=trimmed_history,
671
+ user_message=user_message,
672
+ model_name=(DEFAULT_MODEL or ""),
673
+ )
674
+
675
  return messages
676
 
677
 
678
def model_name_or_default(x: str) -> str:
    """Return *x* when it is truthy; otherwise fall back to ``DEFAULT_MODEL``."""
    if x:
        return x
    return DEFAULT_MODEL
680
+
681
+
682
  @traceable(run_type="chain", name="chat_with_clare")
683
  def chat_with_clare(
684
  message: str,
 
713
  rag_context=rag_context,
714
  )
715
 
716
+ answer, _prof = safe_chat_completion_profiled(
717
  model_name=model_name,
718
  messages=messages,
719
  lang=language_preference,
 
787
  if language_preference == "中文":
788
  messages.append({"role": "system", "content": "请用中文输出要点总结(bullet points)。"})
789
 
790
+ summary_text, _prof = safe_chat_completion_profiled(
791
  model_name=model_name,
792
  messages=messages,
793
  lang=language_preference,