# clare_core.py
import re
import math
from typing import List, Dict, Tuple, Optional

from docx import Document

from .config import (
    client,
    DEFAULT_MODEL,
    EMBEDDING_MODEL,
    DEFAULT_COURSE_TOPICS,
    CLARE_SYSTEM_PROMPT,
    LEARNING_MODE_INSTRUCTIONS,
)
from langsmith import traceable
from langsmith.run_helpers import set_run_metadata


# ---------- syllabus parsing ----------

def parse_syllabus_docx(file_path: str, max_lines: int = 15) -> List[str]:
    """Extract up to ``max_lines`` non-empty paragraphs from a .docx syllabus.

    Deliberately simple: the first few non-blank paragraphs are treated as
    topic lines so Clare has some course context; no precise document
    structure is attempted.

    Args:
        file_path: Path to the .docx file.
        max_lines: Maximum number of paragraph lines to collect.

    Returns:
        A list of topic strings, or a single-element list carrying an error
        marker if the file could not be parsed.
    """
    topics: List[str] = []
    try:
        doc = Document(file_path)
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            topics.append(text)
            if len(topics) >= max_lines:
                break
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        topics = [f"[Error parsing syllabus: {e}]"]
    return topics


# ---------- simple "weakness" detection ----------
WEAKNESS_KEYWORDS = [
    "don't understand",
    "do not understand",
    "not understand",
    "not sure",
    "confused",
    "hard to",
    "difficult",
    "struggle",
    "不会",
    "不懂",
    "看不懂",
    "搞不清",
    "很难",
]

# ---------- simple "mastery" detection ----------
MASTERY_KEYWORDS = [
    "got it",
    "makes sense",
    "now i see",
    "i see",
    "understand now",
    "clear now",
    "easy",
    "no problem",
    "没问题",
    "懂了",
    "明白了",
    "清楚了",
]


def update_weaknesses_from_message(message: str, weaknesses: List[str]) -> List[str]:
    """Record ``message`` as a weakness when it matches a confusion keyword.

    Matching is case-insensitive on the English keywords. ``weaknesses`` may
    be ``None``/empty; a fresh list is created in that case. The (possibly
    updated) list is returned.
    """
    lower_msg = message.lower()
    if any(k in lower_msg for k in WEAKNESS_KEYWORDS):
        weaknesses = weaknesses or []
        weaknesses.append(message)
    return weaknesses


def update_cognitive_state_from_message(
    message: str,
    state: Optional[Dict[str, int]],
) -> Dict[str, int]:
    """Update the simple cognitive-state counters from one student message.

    - a confusion keyword match  → ``confusion`` + 1
    - a mastery keyword match    → ``mastery``   + 1
    """
    if state is None:
        state = {"confusion": 0, "mastery": 0}
    lower_msg = message.lower()
    if any(k in lower_msg for k in WEAKNESS_KEYWORDS):
        state["confusion"] = state.get("confusion", 0) + 1
    if any(k in lower_msg for k in MASTERY_KEYWORDS):
        state["mastery"] = state.get("mastery", 0) + 1
    return state


def describe_cognitive_state(state: Optional[Dict[str, int]]) -> str:
    """Render the counter dict as a short English phrase for prompts/UI."""
    if not state:
        return "unknown"
    confusion = state.get("confusion", 0)
    mastery = state.get("mastery", 0)
    # A state only "wins" with at least 2 hits AND a lead of at least 1.
    if confusion >= 2 and confusion >= mastery + 1:
        return "student shows signs of HIGH cognitive load (often confused)."
    elif mastery >= 2 and mastery >= confusion + 1:
        return "student seems COMFORTABLE; material may be slightly easy."
    else:
        return "mixed or uncertain cognitive state."


# ---------- Session Memory ----------

def _truncate(text: str, limit: int = 120) -> str:
    """Strip ``text`` and shorten it to ``limit`` chars with a trailing ellipsis."""
    text = text.strip()
    if len(text) > limit:
        text = text[: limit - 3] + "..."
    return text


def build_session_memory_summary(
    history: List[Tuple[str, str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    max_questions: int = 4,
    max_weaknesses: int = 3,
) -> str:
    """Build a short "memory" summary used only within the current session.

    The summary contains:
    - the last few student questions,
    - the last few questions the student found difficult,
    - a one-line description of the current cognitive state.

    Returns a single " | "-joined string, or a stock "no prior memory"
    message when nothing is available yet.
    """
    parts: List[str] = []

    # Last few questions (student side of each turn only).
    if history:
        trimmed_qs = [_truncate(u) for (u, _a) in history[-max_questions:]]
        if trimmed_qs:
            parts.append("Recent student questions: " + " | ".join(trimmed_qs))

    # Last few recorded difficulties.
    if weaknesses:
        trimmed_weak = [_truncate(w) for w in weaknesses[-max_weaknesses:]]
        parts.append(
            "Recent difficulties mentioned by the student: " + " | ".join(trimmed_weak)
        )

    # Current cognitive state.
    if cognitive_state:
        parts.append(
            "Current cognitive state: " + describe_cognitive_state(cognitive_state)
        )

    if not parts:
        return (
            "No prior session memory. You can treat this as an early stage of the conversation; "
            "start with simple explanations and ask a quick check-up question."
        )
    return " | ".join(parts)


# ---------- language detection (for Auto mode) ----------

def detect_language(message: str, preference: str) -> str:
    """Resolve which language Clare should answer in.

    preference:
        - 'English' → force English
        - '中文'     → force Chinese
        - 'Auto'    → detect whether the text contains CJK characters
    """
    if preference in ("English", "中文"):
        return preference
    # Auto mode: any character in the CJK Unified Ideographs range → Chinese.
    if re.search(r"[\u4e00-\u9fff]", message):
        return "中文"
    return "English"


def get_empty_input_prompt(lang: str) -> str:
    """Friendly prompt shown when the student submits empty input."""
    if lang == "中文":
        return "请先输入一个问题或想法,再按回车发送,我才能帮到你哦。"
    # English is the default.
    return "Please type a question or some text before sending, then hit Enter."


def build_error_message(
    e: Exception,
    lang: str,
    op: str = "chat",
) -> str:
    """Build a friendly, localized error message for a failed operation.

    The raw exception is never shown to the student; callers are expected
    to log it server-side.

    Args:
        e: The underlying exception (not exposed in the returned text).
        lang: '中文' for Chinese; anything else falls back to English.
        op: Operation kind: 'chat', 'quiz' or 'summary'.
    """
    if lang == "中文":
        prefix = {
            "chat": "抱歉,刚刚在和模型对话时出现了一点问题。",
            "quiz": "抱歉,刚刚在生成测验题目时出现了一点问题。",
            "summary": "抱歉,刚刚在生成总结时出现了一点问题。",
        }.get(op, "抱歉,刚刚出现了一点问题。")
        return prefix + " 请稍后再试一次,或者换个问法试试。"
    # English is the default.
    prefix_en = {
        "chat": "Sorry, I ran into a problem while talking to the model.",
        "quiz": "Sorry, there was a problem while generating the quiz.",
        "summary": "Sorry, there was a problem while generating the summary.",
    }.get(op, "Sorry, something went wrong just now.")
    return prefix_en + " Please try again in a moment or rephrase your request."
# ---------- session status rendering ----------

def render_session_status(
    learning_mode: str,
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """Render a short Markdown panel describing the current session state."""
    lines: List[str] = []
    lines.append("### Session status\n")
    lines.append(f"- Learning mode: **{learning_mode}**")
    lines.append(f"- Cognitive state: {describe_cognitive_state(cognitive_state)}")
    if weaknesses:
        lines.append("- Recent difficulties (last 3):")
        for w in weaknesses[-3:]:
            lines.append(f" - {w}")
    else:
        lines.append("- Recent difficulties: *(none yet)*")
    return "\n".join(lines)


# ---------- Same Question Check helpers ----------

def _normalize_text(text: str) -> str:
    """Lowercase ``text`` and collapse punctuation/whitespace for fuzzy matching."""
    text = text.lower().strip()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text


def _jaccard_similarity(a: str, b: str) -> float:
    """Token-set Jaccard similarity of two normalized strings, in [0.0, 1.0]."""
    tokens_a = set(a.split())
    tokens_b = set(b.split())
    if not tokens_a or not tokens_b:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine similarity of two equal-length vectors; 0.0 on degenerate input."""
    if not a or not b or len(a) != len(b):
        return 0.0
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)


@traceable(run_type="embedding", name="get_embedding")
def get_embedding(text: str) -> Optional[List[float]]:
    """Embed ``text`` with the OpenAI Embedding API; return None on failure."""
    try:
        resp = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=[text],
        )
        return resp.data[0].embedding
    except Exception as e:
        # Log to the Space console for debugging; never raise into the UI.
        print(f"[Embedding error] {repr(e)}")
        return None


def _recent_turns(history: List[Tuple[str, str]], max_turns: int):
    """Yield the last ``max_turns`` (question, answer) pairs, newest first."""
    for checked, pair in enumerate(reversed(history), start=1):
        if checked > max_turns:
            break
        yield pair


def find_similar_past_question(
    message: str,
    history: List[Tuple[str, str]],
    jaccard_threshold: float = 0.65,
    embedding_threshold: float = 0.85,
    max_turns_to_check: int = 6,
) -> Optional[Tuple[str, str, float]]:
    """Look for a question similar to ``message`` in the recent history.

    Two-stage check: a cheap lexical Jaccard pass first, then an
    embedding-based semantic pass (one API call per history turn checked).

    Returns:
        ``(past_question, past_answer, similarity_score)`` when something
        crosses a threshold, otherwise ``None``.
    """
    norm_msg = _normalize_text(message)
    if not norm_msg:
        return None

    # 1) Lexical (Jaccard) pass — an exact normalized match short-circuits.
    best_sim_j = 0.0
    best_pair_j: Optional[Tuple[str, str]] = None
    for user_q, assistant_a in _recent_turns(history, max_turns_to_check):
        norm_hist_q = _normalize_text(user_q)
        if not norm_hist_q:
            continue
        if norm_msg == norm_hist_q:
            return user_q, assistant_a, 1.0
        sim_j = _jaccard_similarity(norm_msg, norm_hist_q)
        if sim_j > best_sim_j:
            best_sim_j = sim_j
            best_pair_j = (user_q, assistant_a)
    if best_pair_j and best_sim_j >= jaccard_threshold:
        return best_pair_j[0], best_pair_j[1], best_sim_j

    # 2) Embedding semantic-similarity pass.
    if not history:
        return None
    msg_emb = get_embedding(message)
    if msg_emb is None:
        return None
    best_sim_e = 0.0
    best_pair_e: Optional[Tuple[str, str]] = None
    for user_q, assistant_a in _recent_turns(history, max_turns_to_check):
        hist_emb = get_embedding(user_q)
        if hist_emb is None:
            continue
        sim_e = cosine_similarity(msg_emb, hist_emb)
        if sim_e > best_sim_e:
            best_sim_e = sim_e
            best_pair_e = (user_q, assistant_a)
    if best_pair_e and best_sim_e >= embedding_threshold:
        return best_pair_e[0], best_pair_e[1], best_sim_e
    return None


@traceable(run_type="llm", name="safe_chat_completion")
def safe_chat_completion(
    model_name: str,
    messages: List[Dict[str, str]],
    lang: str,
    op: str = "chat",
    temperature: float = 0.5,
) -> str:
    """Call the OpenAI Chat Completion API with retry and model fallback.

    - at most 2 attempts, each with a 20-second timeout
    - the first attempt uses the student's chosen model; on failure the
      second attempt falls back to ``DEFAULT_MODEL`` (skipped when that
      was already the model that failed)
    - every exception is logged server-side; the student only ever sees
      the friendly localized text from ``build_error_message``
    """
    preferred_model = model_name or DEFAULT_MODEL
    last_error: Optional[Exception] = None

    for attempt in range(2):
        # First try the student's model; on retry switch to the default one.
        current_model = preferred_model if attempt == 0 else DEFAULT_MODEL
        try:
            resp = client.chat.completions.create(
                model=current_model,
                messages=messages,
                temperature=temperature,
                timeout=20,  # seconds
            )
            return resp.choices[0].message.content
        except Exception as e:
            print(
                f"[safe_chat_completion][{op}] attempt {attempt+1} "
                f"failed with model={current_model}: {repr(e)}"
            )
            last_error = e
            # Retrying with the default model would be a no-op if we just
            # used it, so bail out early in that case.
            if current_model == DEFAULT_MODEL or attempt == 1:
                break

    # Both attempts failed: return the friendly error text.
    return build_error_message(last_error or Exception("unknown error"), lang, op)


# ---------- message construction ----------

def build_messages(
    user_message: str,
    history: List[Tuple[str, str]],
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,  # RAG retrieval result, if any
) -> List[Dict[str, str]]:
    """Assemble the full chat-completion message list for one student turn.

    Layers, in order: base system prompt, learning-mode instruction, course
    outline, uploaded-document hint, weakness hint, cognitive-state hint,
    language preference, session-memory summary, optional RAG excerpts,
    the conversation history, and finally the current user message.
    """
    messages: List[Dict[str, str]] = [
        {"role": "system", "content": CLARE_SYSTEM_PROMPT}
    ]

    # Learning mode.
    if learning_mode in LEARNING_MODE_INSTRUCTIONS:
        mode_instruction = LEARNING_MODE_INSTRUCTIONS[learning_mode]
        messages.append(
            {
                "role": "system",
                "content": f"Current learning mode: {learning_mode}. {mode_instruction}",
            }
        )

    # Course outline (falls back to the default topic list).
    topics = course_outline if course_outline else DEFAULT_COURSE_TOPICS
    topics_text = " | ".join(topics)
    messages.append(
        {
            "role": "system",
            "content": (
                "Here is the course syllabus context. Use this to stay aligned "
                "with the course topics when answering: " + topics_text
            ),
        }
    )

    # Hint about any non-syllabus uploaded document.
    if doc_type and doc_type != "Syllabus":
        messages.append(
            {
                "role": "system",
                "content": (
                    f"The student also uploaded a {doc_type} document as supporting material. "
                    "You do not see the full content directly, but you may assume it is relevant "
                    "to the same course and topics."
                ),
            }
        )

    # Student weakness hint (last 5 recorded difficulties).
    if weaknesses:
        weak_text = " | ".join(weaknesses[-5:])
        messages.append(
            {
                "role": "system",
                "content": (
                    "The student seems to struggle with the following questions or topics. "
                    "Be extra gentle and clear when these appear: " + weak_text
                ),
            }
        )

    # Cognitive-state hint (same thresholds as describe_cognitive_state).
    if cognitive_state:
        confusion = cognitive_state.get("confusion", 0)
        mastery = cognitive_state.get("mastery", 0)
        if confusion >= 2 and confusion >= mastery + 1:
            messages.append(
                {
                    "role": "system",
                    "content": (
                        "The student is currently under HIGH cognitive load. "
                        "Use simpler language, shorter steps, and more concrete examples. "
                        "Avoid long derivations in a single answer, and check understanding "
                        "frequently."
                    ),
                }
            )
        elif mastery >= 2 and mastery >= confusion + 1:
            messages.append(
                {
                    "role": "system",
                    "content": (
                        "The student seems comfortable with the material. "
                        "You may increase difficulty slightly, introduce deeper follow-up "
                        "questions, and connect concepts across topics."
                    ),
                }
            )
        else:
            messages.append(
                {
                    "role": "system",
                    "content": (
                        "The student's cognitive state is mixed or uncertain. "
                        "Keep explanations clear and moderately paced, and probe for "
                        "understanding with short questions."
                    ),
                }
            )

    # Language preference control.
    if language_preference == "English":
        messages.append(
            {"role": "system", "content": "Please answer in English."}
        )
    elif language_preference == "中文":
        messages.append(
            {"role": "system", "content": "请用中文回答学生的问题。"}
        )

    # In-session memory summary.
    session_memory_text = build_session_memory_summary(
        history=history,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
    )
    messages.append(
        {
            "role": "system",
            "content": (
                "Here is a short summary of this session's memory (only within the current chat; "
                "it is not persisted across sessions). Use it to stay consistent with the "
                "student's previous questions, difficulties, and cognitive state: "
                + session_memory_text
            ),
        }
    )

    # RAG retrieval excerpts.
    if rag_context:
        messages.append(
            {
                "role": "system",
                "content": (
                    "Here are some relevant excerpts from the course materials. "
                    "Use them as the primary factual grounding when answering the student's question. "
                    "If there is any conflict between these excerpts and your prior knowledge, "
                    "prefer the excerpts.\n\n" + rag_context
                ),
            }
        )

    # Conversation history.
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        if assistant is not None:
            messages.append({"role": "assistant", "content": assistant})

    # Current input.
    messages.append({"role": "user", "content": user_message})
    return messages


@traceable(run_type="chain", name="chat_with_clare")
def chat_with_clare(
    message: str,
    history: List[Tuple[str, str]],
    model_name: str,
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,
) -> Tuple[str, List[Tuple[str, str]]]:
    """Run one full chat turn and return ``(answer, updated_history)``.

    Tracing metadata is best-effort: a LangSmith failure is logged but
    never interrupts the conversation.
    """
    try:
        set_run_metadata(
            learning_mode=learning_mode,
            language_preference=language_preference,
            doc_type=doc_type,
        )
    except Exception as e:
        print(f"[LangSmith metadata error in chat_with_clare] {repr(e)}")

    # Build the full message stack for this turn.
    messages = build_messages(
        user_message=message,
        history=history,
        language_preference=language_preference,
        learning_mode=learning_mode,
        doc_type=doc_type,
        course_outline=course_outline,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
        rag_context=rag_context,
    )

    # Single safe entry point to the model (retry + fallback handled there).
    answer = safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="chat",
        temperature=0.5,
    )

    # Return a new history list rather than mutating the caller's.
    history = history + [(message, answer)]
    return answer, history


# ---------- export conversation as Markdown ----------

def export_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    learning_mode_val: str,
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """Serialize the session (header + full transcript) as a Markdown string."""
    lines: List[str] = []
    lines.append("# Clare – Conversation Export\n")
    lines.append(f"- Learning mode: **{learning_mode_val}**\n")
    lines.append("- Course topics (short): " + "; ".join(course_outline[:5]) + "\n")
    lines.append(
        f"- Cognitive state snapshot: {describe_cognitive_state(cognitive_state)}\n"
    )
    if weaknesses:
        lines.append("- Observed student difficulties:\n")
        for w in weaknesses[-5:]:
            lines.append(f" - {w}\n")
    lines.append("\n---\n\n")
    for user, assistant in history:
        lines.append(f"**Student:** {user}\n\n")
        lines.append(f"**Clare:** {assistant}\n\n")
        lines.append("---\n\n")
    return "".join(lines)


# ---------- quiz generation (3 questions) ----------

def _format_recent_history(history: List[Tuple[str, str]], max_turns: int) -> str:
    """Format the last ``max_turns`` exchanges as a plain-text transcript."""
    return "".join(
        f"Student: {user}\nClare: {assistant}\n"
        for user, assistant in history[-max_turns:]
    )


@traceable(run_type="chain", name="generate_quiz_from_history")
def generate_quiz_from_history(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """Generate a 3-question concept quiz (with answer key) from recent chat."""
    conversation_text = _format_recent_history(history, 8)
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    messages = [
        {"role": "system", "content": CLARE_SYSTEM_PROMPT},
        {
            "role": "system",
            "content": (
                "Now your task is to create a **short concept quiz** for the student. "
                "Based on the conversation and course topics, generate **3 questions** "
                "(a mix of multiple-choice and short-answer is fine). After listing the "
                "questions, provide an answer key at the end under a heading 'Answer Key'. "
                "Number the questions Q1, Q2, Q3. Adjust the difficulty according to the "
                "student's cognitive state."
            ),
        },
        {
            "role": "system",
            "content": f"Course topics: {topics_text}",
        },
        {
            "role": "system",
            "content": f"Student known difficulties: {weakness_text}",
        },
        {
            "role": "system",
            "content": f"Student cognitive state: {cog_text}",
        },
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + conversation_text
                + "\n\nPlease create the quiz now."
            ),
        },
    ]

    if language_preference == "中文":
        messages.append(
            {
                "role": "system",
                "content": "请用中文给出问题和答案。",
            }
        )

    quiz_text = safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="quiz",
        temperature=0.5,
    )
    return quiz_text


# ---------- concept summary (knowledge-point digest) ----------

@traceable(run_type="chain", name="summarize_conversation")
def summarize_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """Produce a concept-only, bullet-point summary of the tutoring session."""
    conversation_text = _format_recent_history(history, 10)
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    messages = [
        {"role": "system", "content": CLARE_SYSTEM_PROMPT},
        {
            "role": "system",
            "content": (
                "Your task now is to produce a **concept-only summary** of this tutoring "
                "session. Only include knowledge points, definitions, key formulas, "
                "examples, and main takeaways. Do **not** include any personal remarks, "
                "jokes, or off-topic chat. Write in clear bullet points. This summary "
                "should be suitable for the student to paste into their study notes. "
                "Take into account what the student struggled with and their cognitive state."
            ),
        },
        {
            "role": "system",
            "content": f"Course topics context: {topics_text}",
        },
        {
            "role": "system",
            "content": f"Student known difficulties: {weakness_text}",
        },
        {
            "role": "system",
            "content": f"Student cognitive state: {cog_text}",
        },
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + conversation_text
                + "\n\nPlease summarize only the concepts and key ideas learned."
            ),
        },
    ]

    if language_preference == "中文":
        messages.append(
            {
                "role": "system",
                "content": "请用中文给出要点总结,只保留知识点和结论,使用条目符号。",
            }
        )

    summary_text = safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="summary",
        temperature=0.4,
    )
    return summary_text