test_AI_Agent

Sleeping

App Files Files Community

test_AI_Agent / api /clare_core.py

SarahXia0405

Update api/clare_core.py

0e36683 verified about 1 month ago

raw

history blame contribute delete

25.1 kB

	# clare_core.py
	import re
	import math
	from typing import List, Dict, Tuple, Optional

	from docx import Document

	from .config import (
	client,
	DEFAULT_MODEL,
	EMBEDDING_MODEL,
	DEFAULT_COURSE_TOPICS,
	CLARE_SYSTEM_PROMPT,
	LEARNING_MODE_INSTRUCTIONS,
	)
	from langsmith import traceable
	from langsmith.run_helpers import set_run_metadata



	# ---------- syllabus 解析 ----------
	def parse_syllabus_docx(file_path: str, max_lines: int = 15) -> List[str]:
	    """
	    Extract a rough list of topic lines from a syllabus ``.docx`` file.

	    This is intentionally simple: the first ``max_lines`` non-empty
	    paragraphs are taken as topic lines. The goal is only to give Clare
	    some course context, not to recover an exact document structure.

	    Args:
	        file_path: Path of the ``.docx`` file to read.
	        max_lines: Maximum number of topic lines to return. Values <= 0
	            yield an empty list.

	    Returns:
	        The collected paragraph texts (stripped), or a single-element list
	        containing an error description if the document cannot be parsed.
	    """
	    # Without this guard the loop below would still collect one paragraph,
	    # because the limit is only checked after the first append.
	    if max_lines <= 0:
	        return []

	    topics: List[str] = []
	    try:
	        doc = Document(file_path)
	        for para in doc.paragraphs:
	            text = para.text.strip()
	            if not text:
	                continue
	            topics.append(text)
	            if len(topics) >= max_lines:
	                break
	    except Exception as e:
	        # Best-effort: surface the failure as a pseudo-topic instead of raising.
	        topics = [f"[Error parsing syllabus: {e}]"]

	    return topics


	# ---------- Simple "weakness" detection ----------
	# Phrases (English and Chinese) that signal confusion or difficulty.
	# They are matched as plain substrings of the lower-cased student message,
	# so shorter entries also cover longer ones (e.g. "not understand" already
	# matches any message containing "do not understand").
	WEAKNESS_KEYWORDS = [
	    "don't understand",
	    "do not understand",
	    "not understand",
	    "not sure",
	    "confused",
	    "hard to",
	    "difficult",
	    "struggle",
	    "不会",
	    "不懂",
	    "看不懂",
	    "搞不清",
	    "很难",
	]

	# ---------- Simple "mastery" detection ----------
	# Phrases (English and Chinese) that signal the student has understood.
	# They are matched as plain substrings of the lower-cased student message.
	# NOTE(review): because matching is by substring, "easy" also matches
	# phrases such as "not easy", and "i see" already covers "now i see".
	MASTERY_KEYWORDS = [
	    "got it",
	    "makes sense",
	    "now i see",
	    "i see",
	    "understand now",
	    "clear now",
	    "easy",
	    "no problem",
	    "没问题",
	    "懂了",
	    "明白了",
	    "清楚了",
	]


	def update_weaknesses_from_message(message: str, weaknesses: List[str]) -> List[str]:
	    """
	    Record ``message`` as a weakness if it contains a confusion keyword.

	    The message is lower-cased and checked for any ``WEAKNESS_KEYWORDS``
	    substring. The input list is never mutated: a new list is returned so
	    that state shared with the caller is not changed behind its back.

	    Args:
	        message: The student's raw message.
	        weaknesses: Previously recorded weakness messages (``None`` or an
	            empty list are both treated as "nothing recorded yet").

	    Returns:
	        A new list with the earlier entries, plus ``message`` appended when
	        a keyword matched. Always a list, never ``None``.
	    """
	    updated: List[str] = list(weaknesses) if weaknesses else []
	    lower_msg = message.lower()
	    if any(k in lower_msg for k in WEAKNESS_KEYWORDS):
	        updated.append(message)
	    return updated


	def update_cognitive_state_from_message(
	    message: str,
	    state: Optional[Dict[str, int]],
	) -> Dict[str, int]:
	    """
	    Update the simple cognitive-state counters from one student message.

	    - a confusion-type keyword in the message -> ``confusion`` + 1
	    - a mastery-type keyword in the message   -> ``mastery`` + 1

	    A fresh counter dict is created when ``state`` is ``None``; otherwise the
	    given dict is updated in place and returned.
	    """
	    counters = {"confusion": 0, "mastery": 0} if state is None else state
	    text = message.lower()

	    # Each counter is bumped at most once per message, whatever the number
	    # of matching keywords.
	    for counter_name, keywords in (
	        ("confusion", WEAKNESS_KEYWORDS),
	        ("mastery", MASTERY_KEYWORDS),
	    ):
	        if any(keyword in text for keyword in keywords):
	            counters[counter_name] = counters.get(counter_name, 0) + 1

	    return counters


	def describe_cognitive_state(state: Optional[Dict[str, int]]) -> str:
	    """
	    Turn the confusion/mastery counters into a short English description.

	    Returns "unknown" for a missing or empty state. Otherwise one counter
	    must be at least 2 and lead the other by at least 1 to be reported as
	    dominant; anything else is described as mixed.
	    """
	    if not state:
	        return "unknown"

	    confused_count = state.get("confusion", 0)
	    mastered_count = state.get("mastery", 0)

	    if confused_count >= 2 and confused_count >= mastered_count + 1:
	        return "student shows signs of HIGH cognitive load (often confused)."
	    if mastered_count >= 2 and mastered_count >= confused_count + 1:
	        return "student seems COMFORTABLE; material may be slightly easy."
	    return "mixed or uncertain cognitive state."


	# ---------- Session Memory ----------
	def _shorten_for_summary(text: str, limit: int = 120) -> str:
	    """Strip ``text`` and cut it to ``limit`` characters, ending in '...' when cut."""
	    text = text.strip()
	    if len(text) > limit:
	        return text[: limit - 3] + "..."
	    return text


	def build_session_memory_summary(
	    history: List[Tuple[str, str]],
	    weaknesses: Optional[List[str]],
	    cognitive_state: Optional[Dict[str, int]],
	    max_questions: int = 4,
	    max_weaknesses: int = 3,
	) -> str:
	    """
	    Build a one-line "memory summary" that is only valid within this session.

	    The summary joins, with " | " separators:
	    - the most recent student questions (user side of ``history`` only),
	    - the most recent messages the student found difficult,
	    - a description of the current cognitive state.

	    Args:
	        history: ``(student_message, assistant_answer)`` pairs, oldest first.
	        weaknesses: Messages previously flagged as difficulties, or ``None``.
	        cognitive_state: Confusion/mastery counters, or ``None``.
	        max_questions: How many recent questions to include; <= 0 omits them.
	        max_weaknesses: How many recent difficulties to include; <= 0 omits them.

	    Returns:
	        The summary text, or a default hint when there is nothing to report.
	    """
	    parts: List[str] = []

	    # Recent questions (student side only). The explicit "> 0" check matters:
	    # a slice of [-0:] would return the whole history instead of nothing.
	    if history and max_questions > 0:
	        recent_qs = [_shorten_for_summary(u) for (u, _a) in history[-max_questions:]]
	        if recent_qs:
	            parts.append("Recent student questions: " + " | ".join(recent_qs))

	    # Recent difficulties.
	    if weaknesses and max_weaknesses > 0:
	        recent_weak = [_shorten_for_summary(w) for w in weaknesses[-max_weaknesses:]]
	        parts.append(
	            "Recent difficulties mentioned by the student: " + " | ".join(recent_weak)
	        )

	    # Current cognitive state.
	    if cognitive_state:
	        parts.append("Current cognitive state: " + describe_cognitive_state(cognitive_state))

	    if not parts:
	        return (
	            "No prior session memory. You can treat this as an early stage of the conversation; "
	            "start with simple explanations and ask a quick check-up question."
	        )

	    return " | ".join(parts)


	# ---------- 语言检测（用于 Auto 模式） ----------
	def detect_language(message: str, preference: str) -> str:
	    """
	    Decide which language to answer in.

	    preference:
	    - 'English' -> always English
	    - '中文'    -> always Chinese
	    - anything else (e.g. 'Auto') -> Chinese if the message contains a
	      character in the U+4E00..U+9FFF range, otherwise English
	    """
	    explicit_choices = ("English", "中文")
	    if preference in explicit_choices:
	        return preference

	    # Auto mode: a single Chinese character is enough to pick Chinese.
	    has_chinese = re.search(r"[\u4e00-\u9fff]", message) is not None
	    return "中文" if has_chinese else "English"


	def get_empty_input_prompt(lang: str) -> str:
	    """
	    Return the friendly hint shown when the student sends empty input.

	    Chinese text is returned for ``lang == "中文"``; every other value gets
	    the English text.
	    """
	    chinese_hint = "请先输入一个问题或想法，再按回车发送，我才能帮到你哦。"
	    english_hint = "Please type a question or some text before sending, then hit Enter."
	    return chinese_hint if lang == "中文" else english_hint


	def build_error_message(
	    e: Exception,
	    lang: str,
	    op: str = "chat",
	) -> str:
	    """
	    Build a friendly error message for the given operation and language.

	    ``op`` selects the wording ("chat", "quiz" or "summary"; anything else
	    gets a generic sentence). ``lang == "中文"`` selects Chinese, every other
	    value English. The exception ``e`` is accepted for the caller's
	    convenience but is never included in the returned text, so raw errors
	    are not exposed to the student.
	    """
	    if lang == "中文":
	        by_operation = {
	            "chat": "抱歉，刚刚在和模型对话时出现了一点问题。",
	            "quiz": "抱歉，刚刚在生成测验题目时出现了一点问题。",
	            "summary": "抱歉，刚刚在生成总结时出现了一点问题。",
	        }
	        generic = "抱歉，刚刚出现了一点问题。"
	        advice = " 请稍后再试一次，或者换个问法试试。"
	    else:
	        # English is the default for every non-Chinese language value.
	        by_operation = {
	            "chat": "Sorry, I ran into a problem while talking to the model.",
	            "quiz": "Sorry, there was a problem while generating the quiz.",
	            "summary": "Sorry, there was a problem while generating the summary.",
	        }
	        generic = "Sorry, something went wrong just now."
	        advice = " Please try again in a moment or rephrase your request."

	    return by_operation.get(op, generic) + advice


	# ---------- Session 状态展示 ----------
	def render_session_status(
	    learning_mode: str,
	    weaknesses: Optional[List[str]],
	    cognitive_state: Optional[Dict[str, int]],
	) -> str:
	    """
	    Render the session status panel as Markdown text.

	    Shows the learning mode, the described cognitive state and up to the
	    last three recorded difficulties (or a placeholder when there are none).
	    """
	    status_lines: List[str] = [
	        "### Session status\n",
	        f"- Learning mode: {learning_mode}",
	        f"- Cognitive state: {describe_cognitive_state(cognitive_state)}",
	    ]

	    if weaknesses:
	        status_lines.append("- Recent difficulties (last 3):")
	        status_lines.extend(f" - {item}" for item in weaknesses[-3:])
	    else:
	        status_lines.append("- Recent difficulties: (none yet)")

	    return "\n".join(status_lines)


	# ---------- Same Question Check helpers ----------
	def _normalize_text(text: str) -> str:
	"""
	将文本转为小写、去除标点和多余空格，用于简单相似度计算。
	"""
	text = text.lower().strip()
	text = re.sub(r"[^\w\s]", " ", text)
	text = re.sub(r"\s+", " ", text)
	return text


	def _jaccard_similarity(a: str, b: str) -> float:
	tokens_a = set(a.split())
	tokens_b = set(b.split())
	if not tokens_a or not tokens_b:
	return 0.0
	return len(tokens_a & tokens_b) / len(tokens_a \| tokens_b)


	def cosine_similarity(a: List[float], b: List[float]) -> float:
	    """
	    Return the cosine similarity of two equal-length vectors.

	    0.0 is returned when either vector is empty, when the lengths differ,
	    or when either vector has zero magnitude.
	    """
	    same_shape = bool(a) and bool(b) and len(a) == len(b)
	    if not same_shape:
	        return 0.0

	    inner_product = sum(x * y for x, y in zip(a, b))
	    magnitude_a = math.sqrt(sum(x * x for x in a))
	    magnitude_b = math.sqrt(sum(y * y for y in b))

	    # A zero-length vector has no direction; avoid dividing by zero.
	    if magnitude_a == 0 or magnitude_b == 0:
	        return 0.0
	    return inner_product / (magnitude_a * magnitude_b)

	@traceable(run_type="embedding", name="get_embedding")
	def get_embedding(text: str) -> Optional[List[float]]:
	    """
	    Encode ``text`` as a vector via the embeddings endpoint of ``client``.

	    Uses ``EMBEDDING_MODEL``. Any failure is printed to the log and
	    reported to the caller as ``None`` instead of raising.
	    """
	    try:
	        response = client.embeddings.create(model=EMBEDDING_MODEL, input=[text])
	        vector = response.data[0].embedding
	    except Exception as err:
	        # Printed so the failure shows up in the Space log for debugging.
	        print(f"[Embedding error] {repr(err)}")
	        return None
	    return vector


	def find_similar_past_question(
	    message: str,
	    history: List[Tuple[str, str]],
	    jaccard_threshold: float = 0.65,
	    embedding_threshold: float = 0.85,
	    max_turns_to_check: int = 6,
	) -> Optional[Tuple[str, str, float]]:
	    """
	    Look for an earlier question in recent history that resembles ``message``.

	    Only the newest ``max_turns_to_check`` turns are inspected, newest first.
	    Detection has two stages:
	    1. token-level Jaccard similarity on normalised text (an exact
	       normalised match returns immediately with score 1.0);
	    2. cosine similarity between embeddings, used only when stage 1 finds
	       nothing above ``jaccard_threshold``.

	    Returns ``(past_question, past_answer, similarity_score)`` or ``None``.
	    """
	    query_norm = _normalize_text(message)
	    if not query_norm:
	        return None

	    # Stage 1: Jaccard similarity.
	    top_score = 0.0
	    top_turn: Optional[Tuple[str, str]] = None
	    for turn_no, (past_q, past_a) in enumerate(reversed(history), start=1):
	        if turn_no > max_turns_to_check:
	            break

	        past_norm = _normalize_text(past_q)
	        if not past_norm:
	            continue
	        if past_norm == query_norm:
	            return past_q, past_a, 1.0

	        score = _jaccard_similarity(query_norm, past_norm)
	        if score > top_score:
	            top_score, top_turn = score, (past_q, past_a)

	    if top_turn and top_score >= jaccard_threshold:
	        return top_turn[0], top_turn[1], top_score

	    # Stage 2: embedding-based semantic similarity.
	    if not history:
	        return None

	    query_vec = get_embedding(message)
	    if query_vec is None:
	        return None

	    top_score = 0.0
	    top_turn = None
	    for turn_no, (past_q, past_a) in enumerate(reversed(history), start=1):
	        if turn_no > max_turns_to_check:
	            break

	        past_vec = get_embedding(past_q)
	        if past_vec is None:
	            # Embedding failed for this turn; skip it and keep looking.
	            continue

	        score = cosine_similarity(query_vec, past_vec)
	        if score > top_score:
	            top_score, top_turn = score, (past_q, past_a)

	    if top_turn and top_score >= embedding_threshold:
	        return top_turn[0], top_turn[1], top_score

	    return None


	@traceable(run_type="llm", name="safe_chat_completion")
	def safe_chat_completion(
	    model_name: str,
	    messages: List[Dict[str, str]],
	    lang: str,
	    op: str = "chat",
	    temperature: float = 0.5,
	) -> str:
	    """
	    Call the chat-completion endpoint of ``client`` without ever raising.

	    - At most two attempts are made.
	    - Each request is sent with ``timeout=20``.
	    - The first attempt uses the requested model (``DEFAULT_MODEL`` when
	      ``model_name`` is empty); if it fails and was not already
	      ``DEFAULT_MODEL``, one more attempt is made with ``DEFAULT_MODEL``.
	    - Every exception is printed to the log; the caller only ever receives
	      the model's answer or a friendly error message in ``lang``.
	    """
	    first_choice = model_name or DEFAULT_MODEL

	    # Models to try, in order. The fallback is added only when it differs
	    # from the first choice, so the default model is never retried.
	    candidates = [first_choice]
	    if first_choice != DEFAULT_MODEL:
	        candidates.append(DEFAULT_MODEL)

	    failure: Optional[Exception] = None
	    for attempt_no, candidate in enumerate(candidates, start=1):
	        try:
	            response = client.chat.completions.create(
	                model=candidate,
	                messages=messages,
	                temperature=temperature,
	                timeout=20,  # request timeout
	            )
	            return response.choices[0].message.content
	        except Exception as err:
	            print(
	                f"[safe_chat_completion][{op}] attempt {attempt_no} "
	                f"failed with model={candidate}: {repr(err)}"
	            )
	            failure = err

	    # Every candidate failed: hand back a friendly message instead.
	    return build_error_message(failure or Exception("unknown error"), lang, op)


	# ---------- 构建 messages ----------
	def build_messages(
	    user_message: str,
	    history: List[Tuple[str, str]],
	    language_preference: str,
	    learning_mode: str,
	    doc_type: str,
	    course_outline: Optional[List[str]],
	    weaknesses: Optional[List[str]],
	    cognitive_state: Optional[Dict[str, int]],
	    rag_context: Optional[str] = None,  # optional RAG retrieval result
	) -> List[Dict[str, str]]:
	    """
	    Assemble the chat-completion message list for one student turn.

	    System messages are added in this order: base system prompt,
	    learning-mode instruction (if the mode is known), syllabus topics,
	    uploaded-document hint (for non-"Syllabus" types), recent weaknesses,
	    cognitive-state guidance, language instruction (only for "English" or
	    "中文"), session memory summary and RAG excerpts (if given). They are
	    followed by the prior ``history`` turns and finally ``user_message``.

	    Args:
	        user_message: The student's current input.
	        history: ``(student_message, assistant_answer)`` pairs, oldest first.
	        language_preference: "English", "中文", or another value for which
	            no language instruction is added.
	        learning_mode: Key into ``LEARNING_MODE_INSTRUCTIONS``.
	        doc_type: Type label of the uploaded document.
	        course_outline: Topic lines; ``DEFAULT_COURSE_TOPICS`` when empty.
	        weaknesses: Messages previously flagged as difficulties.
	        cognitive_state: Confusion/mastery counters.
	        rag_context: Retrieved course-material excerpts, if any.

	    Returns:
	        A list of ``{"role": ..., "content": ...}`` dicts.
	    """
	    messages: List[Dict[str, str]] = [
	        {"role": "system", "content": CLARE_SYSTEM_PROMPT}
	    ]

	    def add_system(content: str) -> None:
	        messages.append({"role": "system", "content": content})

	    # Learning mode
	    if learning_mode in LEARNING_MODE_INSTRUCTIONS:
	        mode_instruction = LEARNING_MODE_INSTRUCTIONS[learning_mode]
	        add_system(f"Current learning mode: {learning_mode}. {mode_instruction}")

	    # Course outline
	    topics = course_outline if course_outline else DEFAULT_COURSE_TOPICS
	    topics_text = " | ".join(topics)
	    add_system(
	        "Here is the course syllabus context. Use this to stay aligned "
	        "with the course topics when answering: "
	        + topics_text
	    )

	    # Hint about the type of the uploaded file
	    if doc_type and doc_type != "Syllabus":
	        add_system(
	            f"The student also uploaded a {doc_type} document as supporting material. "
	            "You do not see the full content directly, but you may assume it is relevant "
	            "to the same course and topics."
	        )

	    # Student weaknesses (only the five most recent)
	    if weaknesses:
	        weak_text = " | ".join(weaknesses[-5:])
	        add_system(
	            "The student seems to struggle with the following questions or topics. "
	            "Be extra gentle and clear when these appear: " + weak_text
	        )

	    # Cognitive-state guidance; thresholds match describe_cognitive_state.
	    if cognitive_state:
	        confusion = cognitive_state.get("confusion", 0)
	        mastery = cognitive_state.get("mastery", 0)
	        if confusion >= 2 and confusion >= mastery + 1:
	            add_system(
	                "The student is currently under HIGH cognitive load. "
	                "Use simpler language, shorter steps, and more concrete examples. "
	                "Avoid long derivations in a single answer, and check understanding "
	                "frequently."
	            )
	        elif mastery >= 2 and mastery >= confusion + 1:
	            add_system(
	                "The student seems comfortable with the material. "
	                "You may increase difficulty slightly, introduce deeper follow-up "
	                "questions, and connect concepts across topics."
	            )
	        else:
	            add_system(
	                "The student's cognitive state is mixed or uncertain. "
	                "Keep explanations clear and moderately paced, and probe for "
	                "understanding with short questions."
	            )

	    # Language preference
	    if language_preference == "English":
	        add_system("Please answer in English.")
	    elif language_preference == "中文":
	        add_system("请用中文回答学生的问题。")

	    # In-session memory summary
	    session_memory_text = build_session_memory_summary(
	        history=history,
	        weaknesses=weaknesses,
	        cognitive_state=cognitive_state,
	    )
	    add_system(
	        "Here is a short summary of this session's memory (only within the current chat; "
	        "it is not persisted across sessions). Use it to stay consistent with the "
	        "student's previous questions, difficulties, and cognitive state: "
	        + session_memory_text
	    )

	    # RAG retrieval result
	    if rag_context:
	        add_system(
	            "Here are some relevant excerpts from the course materials. "
	            "Use them as the primary factual grounding when answering the student's question. "
	            "If there is any conflict between these excerpts and your prior knowledge, "
	            "prefer the excerpts.\n\n"
	            + rag_context
	        )

	    # Conversation history
	    for user, assistant in history:
	        messages.append({"role": "user", "content": user})
	        if assistant is not None:
	            messages.append({"role": "assistant", "content": assistant})

	    # Current input
	    messages.append({"role": "user", "content": user_message})
	    return messages

	# 装饰器
	@traceable(run_type="chain", name="chat_with_clare")
	def chat_with_clare(
	    message: str,
	    history: List[Tuple[str, str]],
	    model_name: str,
	    language_preference: str,
	    learning_mode: str,
	    doc_type: str,
	    course_outline: Optional[List[str]],
	    weaknesses: Optional[List[str]],
	    cognitive_state: Optional[Dict[str, int]],
	    rag_context: Optional[str] = None,
	) -> Tuple[str, List[Tuple[str, str]]]:
	    """
	    Run one chat turn: build the prompt, call the model, extend the history.

	    Args:
	        message: The student's current input.
	        history: Prior ``(student_message, assistant_answer)`` pairs.
	        model_name: Model requested for this call.
	        language_preference: "English", "中文", or another value such as "Auto".
	        learning_mode, doc_type, course_outline, weaknesses, cognitive_state,
	        rag_context: Passed through to ``build_messages``.

	    Returns:
	        ``(answer, new_history)`` where ``new_history`` is a new list with
	        the current turn appended; the input ``history`` is not modified.
	    """
	    # Tracing metadata is best-effort: a failure here must not block the chat.
	    try:
	        set_run_metadata(
	            learning_mode=learning_mode,
	            language_preference=language_preference,
	            doc_type=doc_type,
	        )
	    except Exception as e:
	        print(f"[LangSmith metadata error in chat_with_clare] {repr(e)}")

	    # Build messages
	    messages = build_messages(
	        user_message=message,
	        history=history,
	        language_preference=language_preference,
	        learning_mode=learning_mode,
	        doc_type=doc_type,
	        course_outline=course_outline,
	        weaknesses=weaknesses,
	        cognitive_state=cognitive_state,
	        rag_context=rag_context,
	    )

	    # Resolve the language used for error messages. Passing the raw
	    # preference would make every non-"中文" value (e.g. "Auto") fall back
	    # to English, even when the student wrote in Chinese.
	    error_lang = detect_language(message or "", language_preference)

	    # Unified safe call
	    answer = safe_chat_completion(
	        model_name=model_name,
	        messages=messages,
	        lang=error_lang,
	        op="chat",
	        temperature=0.5,
	    )

	    history = history + [(message, answer)]
	    return answer, history


	# ---------- 导出对话为 Markdown ----------
	def export_conversation(
	    history: List[Tuple[str, str]],
	    course_outline: List[str],
	    learning_mode_val: str,
	    weaknesses: List[str],
	    cognitive_state: Optional[Dict[str, int]],
	) -> str:
	    """
	    Export the conversation as Markdown text.

	    The export starts with a header (learning mode, first five course
	    topics, cognitive-state snapshot and up to the last five difficulties),
	    followed by every turn in ``history`` separated by horizontal rules.
	    """
	    chunks: List[str] = [
	        "# Clare – Conversation Export\n",
	        f"- Learning mode: {learning_mode_val}\n",
	        "- Course topics (short): " + "; ".join(course_outline[:5]) + "\n",
	        f"- Cognitive state snapshot: {describe_cognitive_state(cognitive_state)}\n",
	    ]

	    if weaknesses:
	        chunks.append("- Observed student difficulties:\n")
	        chunks.extend(f" - {item}\n" for item in weaknesses[-5:])
	    chunks.append("\n---\n\n")

	    for student_text, clare_text in history:
	        chunks.extend(
	            (
	                f"Student: {student_text}\n\n",
	                f"Clare: {clare_text}\n\n",
	                "---\n\n",
	            )
	        )

	    return "".join(chunks)


	# ---------- 生成 3 个 quiz 题目 ----------
	from langsmith import traceable

	@traceable(run_type="chain", name="generate_quiz_from_history")
	def generate_quiz_from_history(
	    history: List[Tuple[str, str]],
	    course_outline: List[str],
	    weaknesses: List[str],
	    cognitive_state: Optional[Dict[str, int]],
	    model_name: str,
	    language_preference: str,
	) -> str:
	    """
	    Ask the model for a three-question quiz based on the recent conversation.

	    The prompt includes the last eight turns, the first eight course
	    topics, up to the last five difficulties and the described cognitive
	    state. A Chinese-language instruction is appended when
	    ``language_preference`` is "中文". Returns the model's text, or a
	    friendly error message if the call fails.
	    """
	    transcript = "".join(
	        f"Student: {student_text}\nClare: {clare_text}\n"
	        for student_text, clare_text in history[-8:]
	    )
	    topics_text = "; ".join(course_outline[:8])
	    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
	    cog_text = describe_cognitive_state(cognitive_state)

	    task_instruction = (
	        "Now your task is to create a short concept quiz for the student. "
	        "Based on the conversation and course topics, generate 3 questions "
	        "(a mix of multiple-choice and short-answer is fine). After listing the "
	        "questions, provide an answer key at the end under a heading 'Answer Key'. "
	        "Number the questions Q1, Q2, Q3. Adjust the difficulty according to the "
	        "student's cognitive state."
	    )
	    request_text = (
	        "Here is the recent conversation between you and the student:\n\n"
	        + transcript
	        + "\n\nPlease create the quiz now."
	    )

	    messages = [
	        {"role": "system", "content": CLARE_SYSTEM_PROMPT},
	        {"role": "system", "content": task_instruction},
	        {"role": "system", "content": f"Course topics: {topics_text}"},
	        {"role": "system", "content": f"Student known difficulties: {weakness_text}"},
	        {"role": "system", "content": f"Student cognitive state: {cog_text}"},
	        {"role": "user", "content": request_text},
	    ]

	    # The language instruction goes last, after the user request.
	    if language_preference == "中文":
	        messages.append({"role": "system", "content": "请用中文给出问题和答案。"})

	    return safe_chat_completion(
	        model_name=model_name,
	        messages=messages,
	        lang=language_preference,
	        op="quiz",
	        temperature=0.5,
	    )


	# ---------- 概念总结（知识点摘要） ----------
	@traceable(run_type="chain", name="summarize_conversation")
	def summarize_conversation(
	    history: List[Tuple[str, str]],
	    course_outline: List[str],
	    weaknesses: List[str],
	    cognitive_state: Optional[Dict[str, int]],
	    model_name: str,
	    language_preference: str,
	) -> str:
	    """
	    Ask the model for a concept-only summary of the recent conversation.

	    The prompt includes the last ten turns, the first eight course topics,
	    up to the last five difficulties and the described cognitive state. A
	    Chinese-language instruction is appended when ``language_preference``
	    is "中文". Returns the model's text, or a friendly error message if the
	    call fails.
	    """
	    transcript = "".join(
	        f"Student: {student_text}\nClare: {clare_text}\n"
	        for student_text, clare_text in history[-10:]
	    )
	    topics_text = "; ".join(course_outline[:8])
	    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
	    cog_text = describe_cognitive_state(cognitive_state)

	    task_instruction = (
	        "Your task now is to produce a concept-only summary of this tutoring "
	        "session. Only include knowledge points, definitions, key formulas, "
	        "examples, and main takeaways. Do not include any personal remarks, "
	        "jokes, or off-topic chat. Write in clear bullet points. This summary "
	        "should be suitable for the student to paste into their study notes. "
	        "Take into account what the student struggled with and their cognitive state."
	    )
	    request_text = (
	        "Here is the recent conversation between you and the student:\n\n"
	        + transcript
	        + "\n\nPlease summarize only the concepts and key ideas learned."
	    )

	    messages = [
	        {"role": "system", "content": CLARE_SYSTEM_PROMPT},
	        {"role": "system", "content": task_instruction},
	        {"role": "system", "content": f"Course topics context: {topics_text}"},
	        {"role": "system", "content": f"Student known difficulties: {weakness_text}"},
	        {"role": "system", "content": f"Student cognitive state: {cog_text}"},
	        {"role": "user", "content": request_text},
	    ]

	    # The language instruction goes last, after the user request.
	    if language_preference == "中文":
	        messages.append(
	            {
	                "role": "system",
	                "content": "请用中文给出要点总结，只保留知识点和结论，使用条目符号。",
	            }
	        )

	    return safe_chat_completion(
	        model_name=model_name,
	        messages=messages,
	        lang=language_preference,
	        op="summary",
	        temperature=0.4,
	    )