# test_AI_Agent / api / clare_core.py
# SarahXia0405's picture
# Update api/clare_core.py
# 0e36683 verified
# clare_core.py
import re
import math
from typing import List, Dict, Tuple, Optional
from docx import Document
from .config import (
client,
DEFAULT_MODEL,
EMBEDDING_MODEL,
DEFAULT_COURSE_TOPICS,
CLARE_SYSTEM_PROMPT,
LEARNING_MODE_INSTRUCTIONS,
)
from langsmith import traceable
from langsmith.run_helpers import set_run_metadata
# ---------- syllabus 解析 ----------
def parse_syllabus_docx(file_path: str, max_lines: int = 15) -> List[str]:
    """
    Extract a rough list of topic lines from a syllabus ``.docx`` file.

    This is a deliberately simple parser: the first ``max_lines`` non-empty
    paragraphs (whitespace-stripped) are taken as topic lines. It only aims
    to give Clare some course context, not to recover the exact structure.

    Args:
        file_path: Path passed to ``Document``.
        max_lines: Maximum number of non-empty paragraphs to collect.
            Values <= 0 yield an empty list.

    Returns:
        The collected paragraph texts. If opening or reading the document
        raises, a single-element list holding an
        ``"[Error parsing syllabus: ...]"`` message is returned instead.
    """
    # The limit is checked after each append below, so without this guard a
    # limit of 0 (or a negative one) would still return one paragraph.
    if max_lines <= 0:
        return []

    topics: List[str] = []
    try:
        doc = Document(file_path)
        for para in doc.paragraphs:
            text = para.text.strip()
            if not text:
                continue
            topics.append(text)
            if len(topics) >= max_lines:
                break
    except Exception as e:
        # Best-effort: surface the failure as a pseudo-topic rather than
        # raising, so callers always get a list of strings.
        topics = [f"[Error parsing syllabus: {e}]"]
    return topics
# ---------- 简单“弱项”检测 ----------
# Substrings that signal the student is confused or struggling. The functions
# in this module test them with ``in`` against the lower-cased message, so the
# English entries are written in lower case.
WEAKNESS_KEYWORDS = [
    # English phrases
    "don't understand", "do not understand", "not understand", "not sure",
    "confused", "hard to", "difficult", "struggle",
    # Chinese phrases
    "不会", "不懂", "看不懂", "搞不清", "很难",
]
# ---------- 简单“掌握”检测 ----------
# Substrings that signal the student feels they have understood the material.
# They are tested with ``in`` against the lower-cased message, so the English
# entries are written in lower case.
MASTERY_KEYWORDS = [
    # English phrases
    "got it", "makes sense", "now i see", "i see",
    "understand now", "clear now", "easy", "no problem",
    # Chinese phrases
    "没问题", "懂了", "明白了", "清楚了",
]
def update_weaknesses_from_message(message: str, weaknesses: List[str]) -> List[str]:
    """
    Record ``message`` as a weakness if it contains a confusion keyword.

    The message is lower-cased and checked for any substring listed in
    ``WEAKNESS_KEYWORDS``. On a match the original (non-lower-cased) message
    is appended to the weakness list.

    Args:
        message: The student's message. ``None`` or empty never matches.
        weaknesses: Weaknesses recorded so far; ``None`` is treated as empty.

    Returns:
        A new list (the input is never mutated) with the previous entries
        and, on a match, ``message`` at the end. Always a list, never
        ``None``.
    """
    # Copy so the result is always a list and the caller's list is untouched.
    # Previously a ``None`` input with no match returned ``None``, and only
    # non-empty input lists were mutated in place.
    updated: List[str] = list(weaknesses) if weaknesses else []
    lower_msg = (message or "").lower()
    if any(k in lower_msg for k in WEAKNESS_KEYWORDS):
        updated.append(message)
    return updated
def update_cognitive_state_from_message(
    message: str,
    state: Optional[Dict[str, int]],
) -> Dict[str, int]:
    """
    Update simple keyword-based counters of the student's cognitive state.

    - a confusion keyword (``WEAKNESS_KEYWORDS``) in the message -> confusion +1
    - a mastery keyword (``MASTERY_KEYWORDS``) in the message -> mastery +1

    Both counters may be incremented by the same message. When ``state`` is
    ``None`` a fresh ``{"confusion": 0, "mastery": 0}`` dict is created;
    otherwise the given dict is updated in place and returned.
    """
    if state is None:
        state = {"confusion": 0, "mastery": 0}

    text = message.lower()

    # (counter name, keywords that bump it); confusion is checked first.
    rules = (
        ("confusion", WEAKNESS_KEYWORDS),
        ("mastery", MASTERY_KEYWORDS),
    )
    for counter, keywords in rules:
        if any(keyword in text for keyword in keywords):
            state[counter] = state.get(counter, 0) + 1
    return state
def describe_cognitive_state(state: Optional[Dict[str, int]]) -> str:
    """
    Turn the confusion/mastery counters into a short English description.

    Returns ``"unknown"`` for a missing or empty state. Otherwise the result
    is "HIGH cognitive load" when confusion is at least 2 and leads mastery
    by at least 1, "COMFORTABLE" in the mirrored case, and "mixed or
    uncertain" for everything else. Missing counters count as 0.
    """
    if not state:
        return "unknown"

    confused_count = state.get("confusion", 0)
    mastered_count = state.get("mastery", 0)

    if confused_count >= 2 and confused_count >= mastered_count + 1:
        return "student shows signs of HIGH cognitive load (often confused)."
    if mastered_count >= 2 and mastered_count >= confused_count + 1:
        return "student seems COMFORTABLE; material may be slightly easy."
    return "mixed or uncertain cognitive state."
# ---------- Session Memory ----------
def _trim_for_summary(text: str, limit: int = 120) -> str:
    """Strip ``text`` and cut it to at most ``limit`` characters, ending in '...'."""
    text = text.strip()
    if len(text) > limit:
        text = text[: limit - 3] + "..."
    return text


def build_session_memory_summary(
    history: List[Tuple[str, str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    max_questions: int = 4,
    max_weaknesses: int = 3,
) -> str:
    """
    Build a one-line "memory summary" that is only used within this session.

    The summary joins, with ``" | "``, whichever of these parts are available:
    - the most recent student questions (the first item of each history pair),
    - the most recent messages the student found difficult,
    - a description of the current cognitive state.

    Each question/difficulty is stripped and truncated to 120 characters.

    Args:
        history: ``(student_message, assistant_answer)`` pairs, oldest first.
        weaknesses: Messages recorded as difficulties, oldest first.
        cognitive_state: Confusion/mastery counters; skipped when empty.
        max_questions: How many trailing questions to include; <= 0 disables
            that part.
        max_weaknesses: How many trailing difficulties to include; <= 0
            disables that part.

    Returns:
        The joined summary, or a fixed "No prior session memory..." hint when
        no part is available.
    """
    parts: List[str] = []

    # Recent questions (student side only). The explicit ``> 0`` check
    # matters: ``seq[-0:]`` is the whole sequence, not an empty one.
    if history and max_questions > 0:
        recent_qs = [_trim_for_summary(u) for (u, _a) in history[-max_questions:]]
        parts.append("Recent student questions: " + " | ".join(recent_qs))

    # Recent difficulties.
    if weaknesses and max_weaknesses > 0:
        recent_weak = [_trim_for_summary(w) for w in weaknesses[-max_weaknesses:]]
        parts.append(
            "Recent difficulties mentioned by the student: " + " | ".join(recent_weak)
        )

    # Current cognitive state.
    if cognitive_state:
        parts.append("Current cognitive state: " + describe_cognitive_state(cognitive_state))

    if not parts:
        return (
            "No prior session memory. You can treat this as an early stage of the conversation; "
            "start with simple explanations and ask a quick check-up question."
        )
    return " | ".join(parts)
# ---------- 语言检测(用于 Auto 模式) ----------
def detect_language(message: str, preference: str) -> str:
    """
    Resolve the reply language from the user's preference and message.

    preference:
    - 'English' -> always English
    - '中文'    -> always Chinese
    - anything else (e.g. 'Auto') -> '中文' if the message contains at least
      one character in the range U+4E00..U+9FFF, otherwise 'English'
    """
    explicit_choices = ("English", "中文")
    if preference in explicit_choices:
        return preference

    # Auto mode: a single character from the checked range is enough.
    contains_chinese = re.search(r"[\u4e00-\u9fff]", message) is not None
    return "中文" if contains_chinese else "English"
def get_empty_input_prompt(lang: str) -> str:
    """
    Return a friendly hint for an empty submission.

    The Chinese text is returned when ``lang`` is exactly '中文'; every other
    value gets the English text.
    """
    chinese_hint = "请先输入一个问题或想法,再按回车发送,我才能帮到你哦。"
    english_hint = "Please type a question or some text before sending, then hit Enter."
    return chinese_hint if lang == "中文" else english_hint
def build_error_message(
    e: Exception,
    lang: str,
    op: str = "chat",
) -> str:
    """
    Build a friendly error message for the given operation and language.

    ``op`` selects the wording ("chat", "quiz" or "summary"; anything else
    gets a generic sentence). ``lang == "中文"`` selects Chinese, every other
    value English. The exception ``e`` is accepted but not used here, so no
    raw error detail reaches the student through this function.
    """
    if lang == "中文":
        fallback = "抱歉,刚刚出现了一点问题。"
        suffix = " 请稍后再试一次,或者换个问法试试。"
        wording = {
            "chat": "抱歉,刚刚在和模型对话时出现了一点问题。",
            "quiz": "抱歉,刚刚在生成测验题目时出现了一点问题。",
            "summary": "抱歉,刚刚在生成总结时出现了一点问题。",
        }
    else:
        # English is the default for every non-Chinese language value.
        fallback = "Sorry, something went wrong just now."
        suffix = " Please try again in a moment or rephrase your request."
        wording = {
            "chat": "Sorry, I ran into a problem while talking to the model.",
            "quiz": "Sorry, there was a problem while generating the quiz.",
            "summary": "Sorry, there was a problem while generating the summary.",
        }
    return wording.get(op, fallback) + suffix
# ---------- Session 状态展示 ----------
def render_session_status(
    learning_mode: str,
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """
    Render the session status panel as Markdown.

    The output has a heading, the learning mode, the cognitive-state
    description, and either the last three recorded difficulties or a
    "none yet" placeholder, joined by newlines.
    """
    status = [
        "### Session status\n",
        f"- Learning mode: **{learning_mode}**",
        f"- Cognitive state: {describe_cognitive_state(cognitive_state)}",
    ]
    if not weaknesses:
        status.append("- Recent difficulties: *(none yet)*")
    else:
        status.append("- Recent difficulties (last 3):")
        status.extend(f" - {item}" for item in weaknesses[-3:])
    return "\n".join(status)
# ---------- Same Question Check helpers ----------
def _normalize_text(text: str) -> str:
"""
将文本转为小写、去除标点和多余空格,用于简单相似度计算。
"""
text = text.lower().strip()
text = re.sub(r"[^\w\s]", " ", text)
text = re.sub(r"\s+", " ", text)
return text
def _jaccard_similarity(a: str, b: str) -> float:
tokens_a = set(a.split())
tokens_b = set(b.split())
if not tokens_a or not tokens_b:
return 0.0
return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Cosine similarity of two equal-length vectors.

    Returns 0.0 when either vector is empty, when the lengths differ, or
    when either vector has zero norm.
    """
    comparable = bool(a) and bool(b) and len(a) == len(b)
    if not comparable:
        return 0.0

    dot_product = sum(left * right for left, right in zip(a, b))
    length_a = math.sqrt(sum(value * value for value in a))
    length_b = math.sqrt(sum(value * value for value in b))

    # A zero-length vector has no direction; avoid dividing by zero.
    if length_a == 0 or length_b == 0:
        return 0.0
    return dot_product / (length_a * length_b)
@traceable(run_type="embedding", name="get_embedding")
def get_embedding(text: str) -> Optional[List[float]]:
    """
    Encode ``text`` as an embedding vector.

    Sends a one-item batch to ``client.embeddings.create`` using
    ``EMBEDDING_MODEL`` and returns the embedding of the first result.
    Any exception is printed to stdout (so it appears in the host's logs)
    and ``None`` is returned instead of raising.
    """
    try:
        response = client.embeddings.create(model=EMBEDDING_MODEL, input=[text])
        first_item = response.data[0]
        return first_item.embedding
    except Exception as exc:
        # Logged for troubleshooting; callers treat None as "no embedding".
        print(f"[Embedding error] {repr(exc)}")
        return None
def find_similar_past_question(
    message: str,
    history: List[Tuple[str, str]],
    jaccard_threshold: float = 0.65,
    embedding_threshold: float = 0.85,
    max_turns_to_check: int = 6,
) -> Optional[Tuple[str, str, float]]:
    """
    Look for an earlier question in recent history that resembles ``message``.

    Only the ``max_turns_to_check`` most recent turns are inspected, newest
    first, in two stages:

    1. Lexical: an exact match of the normalized texts returns immediately
       with score 1.0; otherwise the best Jaccard score is returned if it
       reaches ``jaccard_threshold``.
    2. Semantic: if stage 1 finds nothing, embeddings of the message and of
       each recent question are compared by cosine similarity, and the best
       one is returned if it reaches ``embedding_threshold``.

    Returns ``(past_question, past_answer, similarity_score)`` or ``None``.
    On equal scores the more recent turn wins.
    """
    normalized_message = _normalize_text(message)
    if not normalized_message:
        return None

    # ---- Stage 1: Jaccard similarity on normalized tokens ----
    top_score = 0.0
    top_turn: Optional[Tuple[str, str]] = None
    for position, (question, answer) in enumerate(reversed(history), start=1):
        if position > max_turns_to_check:
            break
        normalized_question = _normalize_text(question)
        if not normalized_question:
            continue
        if normalized_question == normalized_message:
            return question, answer, 1.0
        score = _jaccard_similarity(normalized_message, normalized_question)
        if score > top_score:
            top_score, top_turn = score, (question, answer)

    if top_turn and top_score >= jaccard_threshold:
        return top_turn[0], top_turn[1], top_score

    # ---- Stage 2: embedding-based semantic similarity ----
    if not history:
        return None

    message_vector = get_embedding(message)
    if message_vector is None:
        return None

    top_score = 0.0
    top_turn = None
    for position, (question, answer) in enumerate(reversed(history), start=1):
        if position > max_turns_to_check:
            break
        question_vector = get_embedding(question)
        if question_vector is None:
            # Embedding failed for this turn; skip it and keep going.
            continue
        score = cosine_similarity(message_vector, question_vector)
        if score > top_score:
            top_score, top_turn = score, (question, answer)

    if top_turn and top_score >= embedding_threshold:
        return top_turn[0], top_turn[1], top_score
    return None
@traceable(run_type="llm", name="safe_chat_completion")
def safe_chat_completion(
    model_name: str,
    messages: List[Dict[str, str]],
    lang: str,
    op: str = "chat",
    temperature: float = 0.5,
) -> str:
    """
    Call ``client.chat.completions.create`` with a fallback model.

    - At most two attempts are made, each with ``timeout=20``.
    - The first attempt uses ``model_name`` (or ``DEFAULT_MODEL`` when it is
      empty). If it fails and was not already ``DEFAULT_MODEL``, one more
      attempt is made with ``DEFAULT_MODEL``.
    - Every exception is printed to stdout; the caller never sees it.
      Instead, after the last failure, the friendly text from
      ``build_error_message`` for ``lang`` and ``op`` is returned.

    Returns the content of the first choice on success.
    """
    first_choice = model_name or DEFAULT_MODEL

    # Models in the order they are tried; the fallback is only added when it
    # differs from the first choice.
    candidates = [first_choice]
    if first_choice != DEFAULT_MODEL:
        candidates.append(DEFAULT_MODEL)

    failure: Optional[Exception] = None
    for attempt_no, candidate in enumerate(candidates, start=1):
        try:
            completion = client.chat.completions.create(
                model=candidate,
                messages=messages,
                temperature=temperature,
                timeout=20,  # 20-second timeout per request
            )
            return completion.choices[0].message.content
        except Exception as exc:
            print(
                f"[safe_chat_completion][{op}] attempt {attempt_no} "
                f"failed with model={candidate}: {repr(exc)}"
            )
            failure = exc

    # Every candidate failed: hand back the friendly error text.
    return build_error_message(failure or Exception("unknown error"), lang, op)
# ---------- 构建 messages ----------
def build_messages(
    user_message: str,
    history: List[Tuple[str, str]],
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,  # RAG retrieval result
) -> List[Dict[str, str]]:
    """
    Assemble the chat message list for one tutoring turn.

    System messages are added in this order, each only when applicable:
    the base system prompt, the learning-mode instruction, the course
    outline (``DEFAULT_COURSE_TOPICS`` when none is given), a note about a
    non-syllabus upload, the last five recorded difficulties, guidance
    derived from the cognitive state, the explicit language preference, the
    session-memory summary, and the RAG excerpts. The earlier turns follow
    as user/assistant messages, and ``user_message`` comes last.
    """
    messages: List[Dict[str, str]] = []

    def add(role: str, content: str) -> None:
        messages.append({"role": role, "content": content})

    add("system", CLARE_SYSTEM_PROMPT)

    # Learning mode
    if learning_mode in LEARNING_MODE_INSTRUCTIONS:
        mode_instruction = LEARNING_MODE_INSTRUCTIONS[learning_mode]
        add("system", f"Current learning mode: {learning_mode}. {mode_instruction}")

    # Course outline
    outline = course_outline or DEFAULT_COURSE_TOPICS
    add(
        "system",
        "Here is the course syllabus context. Use this to stay aligned "
        "with the course topics when answering: "
        + " | ".join(outline),
    )

    # Hint about the type of the uploaded file
    if doc_type and doc_type != "Syllabus":
        add(
            "system",
            f"The student also uploaded a {doc_type} document as supporting material. "
            "You do not see the full content directly, but you may assume it is relevant "
            "to the same course and topics.",
        )

    # Recorded student difficulties
    if weaknesses:
        add(
            "system",
            "The student seems to struggle with the following questions or topics. "
            "Be extra gentle and clear when these appear: " + " | ".join(weaknesses[-5:]),
        )

    # Guidance derived from the cognitive state
    if cognitive_state:
        confusion = cognitive_state.get("confusion", 0)
        mastery = cognitive_state.get("mastery", 0)
        if confusion >= 2 and confusion >= mastery + 1:
            guidance = (
                "The student is currently under HIGH cognitive load. "
                "Use simpler language, shorter steps, and more concrete examples. "
                "Avoid long derivations in a single answer, and check understanding "
                "frequently."
            )
        elif mastery >= 2 and mastery >= confusion + 1:
            guidance = (
                "The student seems comfortable with the material. "
                "You may increase difficulty slightly, introduce deeper follow-up "
                "questions, and connect concepts across topics."
            )
        else:
            guidance = (
                "The student's cognitive state is mixed or uncertain. "
                "Keep explanations clear and moderately paced, and probe for "
                "understanding with short questions."
            )
        add("system", guidance)

    # Explicit language preference (nothing is added for other values)
    if language_preference == "English":
        add("system", "Please answer in English.")
    elif language_preference == "中文":
        add("system", "请用中文回答学生的问题。")

    # In-session memory summary
    memory_summary = build_session_memory_summary(
        history=history,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
    )
    add(
        "system",
        "Here is a short summary of this session's memory (only within the current chat; "
        "it is not persisted across sessions). Use it to stay consistent with the "
        "student's previous questions, difficulties, and cognitive state: "
        + memory_summary,
    )

    # RAG retrieval result
    if rag_context:
        add(
            "system",
            "Here are some relevant excerpts from the course materials. "
            "Use them as the primary factual grounding when answering the student's question. "
            "If there is any conflict between these excerpts and your prior knowledge, "
            "prefer the excerpts.\n\n"
            + rag_context,
        )

    # Earlier turns; a turn without an answer contributes only the user side.
    for past_user, past_assistant in history:
        add("user", past_user)
        if past_assistant is not None:
            add("assistant", past_assistant)

    # Current input
    add("user", user_message)
    return messages
# 装饰器
@traceable(run_type="chain", name="chat_with_clare")
def chat_with_clare(
    message: str,
    history: List[Tuple[str, str]],
    model_name: str,
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,
) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Run one chat turn: build the prompt, call the model, extend the history.

    Args:
        message: The student's current message.
        history: Earlier ``(student_message, answer)`` pairs; not mutated.
        model_name: Model to try first (see ``safe_chat_completion``).
        language_preference: 'English', '中文', or another value such as
            'Auto', in which case the language is detected from ``message``.
        learning_mode, doc_type, course_outline, weaknesses, cognitive_state,
        rag_context: Passed through to ``build_messages``.

    Returns:
        ``(answer, new_history)``, where ``new_history`` is a new list with
        ``(message, answer)`` appended. If the model call fails, ``answer``
        is the friendly error text from ``safe_chat_completion``.
    """
    # Tracing metadata is best-effort; a failure here must not break the chat.
    try:
        set_run_metadata(
            learning_mode=learning_mode,
            language_preference=language_preference,
            doc_type=doc_type,
        )
    except Exception as e:
        print(f"[LangSmith metadata error in chat_with_clare] {repr(e)}")

    # Build the messages
    messages = build_messages(
        user_message=message,
        history=history,
        language_preference=language_preference,
        learning_mode=learning_mode,
        doc_type=doc_type,
        course_outline=course_outline,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
        rag_context=rag_context,
    )

    # Resolve the language used for a possible error message. Passing the raw
    # preference meant that in 'Auto' mode the error text was always English,
    # even for a message written in Chinese.
    error_lang = detect_language(message, language_preference)

    # Unified safe call
    answer = safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=error_lang,
        op="chat",
        temperature=0.5,
    )
    history = history + [(message, answer)]
    return answer, history
# ---------- 导出对话为 Markdown ----------
def export_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    learning_mode_val: str,
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """
    Export the conversation as a Markdown document.

    The document starts with a header (learning mode, the first five course
    topics, a cognitive-state snapshot, and up to the last five recorded
    difficulties), followed by every turn as a "Student" / "Clare" pair with
    a horizontal rule after each turn.
    """
    chunks: List[str] = [
        "# Clare – Conversation Export\n",
        f"- Learning mode: **{learning_mode_val}**\n",
        "- Course topics (short): " + "; ".join(course_outline[:5]) + "\n",
        f"- Cognitive state snapshot: {describe_cognitive_state(cognitive_state)}\n",
    ]

    if weaknesses:
        chunks.append("- Observed student difficulties:\n")
        chunks.extend(f" - {item}\n" for item in weaknesses[-5:])

    chunks.append("\n---\n\n")

    for student_text, clare_text in history:
        chunks.extend(
            (
                f"**Student:** {student_text}\n\n",
                f"**Clare:** {clare_text}\n\n",
                "---\n\n",
            )
        )
    return "".join(chunks)
# ---------- 生成 3 个 quiz 题目 ----------
from langsmith import traceable
@traceable(run_type="chain", name="generate_quiz_from_history")
def generate_quiz_from_history(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """
    Ask the model for a three-question concept quiz based on the session.

    The prompt holds the system prompt, the quiz instructions, the first
    eight course topics, the last five recorded difficulties (or "N/A"), the
    cognitive-state description, and the last eight turns of the
    conversation. A Chinese-output instruction is appended when
    ``language_preference`` is '中文'.

    Returns whatever ``safe_chat_completion`` returns for ``op="quiz"``: the
    quiz text, or a friendly error message if the call fails.
    """
    transcript = "".join(
        f"Student: {user}\nClare: {assistant}\n" for user, assistant in history[-8:]
    )
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    quiz_instructions = (
        "Now your task is to create a **short concept quiz** for the student. "
        "Based on the conversation and course topics, generate **3 questions** "
        "(a mix of multiple-choice and short-answer is fine). After listing the "
        "questions, provide an answer key at the end under a heading 'Answer Key'. "
        "Number the questions Q1, Q2, Q3. Adjust the difficulty according to the "
        "student's cognitive state."
    )
    system_contents = [
        CLARE_SYSTEM_PROMPT,
        quiz_instructions,
        f"Course topics: {topics_text}",
        f"Student known difficulties: {weakness_text}",
        f"Student cognitive state: {cog_text}",
    ]
    messages = [{"role": "system", "content": content} for content in system_contents]
    messages.append(
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + transcript
                + "\n\nPlease create the quiz now."
            ),
        }
    )

    # Chinese output is requested with a trailing system message.
    if language_preference == "中文":
        messages.append({"role": "system", "content": "请用中文给出问题和答案。"})

    return safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="quiz",
        temperature=0.5,
    )
# ---------- 概念总结(知识点摘要) ----------
@traceable(run_type="chain", name="summarize_conversation")
def summarize_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """
    Ask the model for a concept-only summary of the tutoring session.

    The prompt holds the system prompt, the summary instructions, the first
    eight course topics, the last five recorded difficulties (or "N/A"), the
    cognitive-state description, and the last ten turns of the conversation.
    A Chinese-output instruction is appended when ``language_preference`` is
    '中文'.

    Returns whatever ``safe_chat_completion`` returns for ``op="summary"``:
    the summary text, or a friendly error message if the call fails.
    """
    transcript = "".join(
        f"Student: {user}\nClare: {assistant}\n" for user, assistant in history[-10:]
    )
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    summary_instructions = (
        "Your task now is to produce a **concept-only summary** of this tutoring "
        "session. Only include knowledge points, definitions, key formulas, "
        "examples, and main takeaways. Do **not** include any personal remarks, "
        "jokes, or off-topic chat. Write in clear bullet points. This summary "
        "should be suitable for the student to paste into their study notes. "
        "Take into account what the student struggled with and their cognitive state."
    )
    system_contents = [
        CLARE_SYSTEM_PROMPT,
        summary_instructions,
        f"Course topics context: {topics_text}",
        f"Student known difficulties: {weakness_text}",
        f"Student cognitive state: {cog_text}",
    ]
    messages = [{"role": "system", "content": content} for content in system_contents]
    messages.append(
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + transcript
                + "\n\nPlease summarize only the concepts and key ideas learned."
            ),
        }
    )

    # Chinese output is requested with a trailing system message.
    if language_preference == "中文":
        messages.append(
            {
                "role": "system",
                "content": "请用中文给出要点总结,只保留知识点和结论,使用条目符号。",
            }
        )

    return safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="summary",
        temperature=0.4,
    )