test_AI_Agent

Sleeping

File size: 25,132 Bytes

# clare_core.py
import re
import math
from typing import List, Dict, Tuple, Optional

from docx import Document

from .config import (
    client,
    DEFAULT_MODEL,
    EMBEDDING_MODEL,
    DEFAULT_COURSE_TOPICS,
    CLARE_SYSTEM_PROMPT,
    LEARNING_MODE_INSTRUCTIONS,
)
from langsmith import traceable
from langsmith.run_helpers import set_run_metadata



# ---------- syllabus 解析 ----------
def parse_syllabus_docx(file_path: str, max_lines: int = 15) -> List[str]:
    """
    Very light-weight syllabus parsing.

    Collects the stripped text of the first non-empty paragraphs of the .docx
    file and treats each as a topic line. Collection stops as soon as
    ``max_lines`` paragraphs have been gathered. The goal is only to give
    Clare some course context, not to recover the document structure.

    If anything raises while opening or reading the document, a one-element
    list containing an "[Error parsing syllabus: ...]" string is returned
    instead of propagating the exception.
    """
    try:
        collected: List[str] = []
        for paragraph in Document(file_path).paragraphs:
            stripped = paragraph.text.strip()
            if stripped:
                collected.append(stripped)
                # Checked after appending, exactly once per kept paragraph.
                if len(collected) >= max_lines:
                    break
        return collected
    except Exception as exc:
        return [f"[Error parsing syllabus: {exc}]"]


# ---------- 简单“弱项”检测 ----------
# Phrases (English and Chinese) that indicate the student is struggling.
# update_weaknesses_from_message and update_cognitive_state_from_message test
# them as substrings of the lower-cased message, so the English entries must
# stay lower-case.
WEAKNESS_KEYWORDS = [
    "don't understand",
    "do not understand",
    "not understand",
    "not sure",
    "confused",
    "hard to",
    "difficult",
    "struggle",
    "不会",  # "don't know how to"
    "不懂",  # "don't understand"
    "看不懂",  # "can't make sense of it"
    "搞不清",  # "can't figure it out"
    "很难",  # "very hard"
]

# ---------- 简单“掌握”检测 ----------
# Phrases (English and Chinese) that indicate the student has understood.
# update_cognitive_state_from_message tests them as substrings of the
# lower-cased message, so the English entries must stay lower-case.
MASTERY_KEYWORDS = [
    "got it",
    "makes sense",
    "now i see",
    "i see",
    "understand now",
    "clear now",
    "easy",
    "no problem",
    "没问题",  # "no problem"
    "懂了",  # "got it"
    "明白了",  # "understood"
    "清楚了",  # "it's clear now"
]


def update_weaknesses_from_message(message: str, weaknesses: List[str]) -> List[str]:
    """
    Record ``message`` as a difficulty when it contains a struggle keyword.

    Args:
        message: The student's latest message.
        weaknesses: Messages recorded so far; None or an empty list is
            treated as "nothing recorded yet".

    Returns:
        A new list holding the previous entries, with ``message`` appended
        when its lower-cased text contains any WEAKNESS_KEYWORDS entry.
        The list passed in is never modified, and the result is always a
        list (never None).
    """
    # Work on a copy so the caller's list is never changed as a side effect,
    # regardless of whether it was empty or not.
    updated: List[str] = list(weaknesses) if weaknesses else []

    lower_msg = message.lower()
    if any(keyword in lower_msg for keyword in WEAKNESS_KEYWORDS):
        updated.append(message)
    return updated


def update_cognitive_state_from_message(
    message: str,
    state: Optional[Dict[str, int]],
) -> Dict[str, int]:
    """
    Update the running confusion/mastery tally from one student message.

    - the message contains any WEAKNESS_KEYWORDS entry -> "confusion" + 1
    - the message contains any MASTERY_KEYWORDS entry  -> "mastery" + 1

    Matching is a substring test on the lower-cased message; each counter
    grows by at most one per message. When ``state`` is None a fresh counter
    dict is created; otherwise the given dict is updated in place and
    returned.
    """
    counters = {"confusion": 0, "mastery": 0} if state is None else state
    text = message.lower()

    for counter_key, keywords in (
        ("confusion", WEAKNESS_KEYWORDS),
        ("mastery", MASTERY_KEYWORDS),
    ):
        for keyword in keywords:
            if keyword in text:
                counters[counter_key] = counters.get(counter_key, 0) + 1
                break  # one hit per category is enough
    return counters


def describe_cognitive_state(state: Optional[Dict[str, int]]) -> str:
    """
    Turn the confusion/mastery counters into a short English description.

    Returns "unknown" for None or an empty dict. Otherwise a counter is
    considered dominant when it is at least 2 and exceeds the other one by
    at least 1; confusion is checked first. Missing keys count as 0.
    """
    if not state:
        return "unknown"

    confusion_count = state.get("confusion", 0)
    mastery_count = state.get("mastery", 0)

    if confusion_count >= 2 and mastery_count + 1 <= confusion_count:
        return "student shows signs of HIGH cognitive load (often confused)."
    if mastery_count >= 2 and confusion_count + 1 <= mastery_count:
        return "student seems COMFORTABLE; material may be slightly easy."
    return "mixed or uncertain cognitive state."


# ---------- Session Memory ----------
def _truncate_for_summary(text: str, limit: int = 120) -> str:
    """Strip ``text`` and cut it to ``limit`` characters, ending in "..." when cut."""
    text = text.strip()
    if len(text) > limit:
        text = text[: limit - 3] + "..."
    return text


def build_session_memory_summary(
    history: List[Tuple[str, str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    max_questions: int = 4,
    max_weaknesses: int = 3,
) -> str:
    """
    Build a "memory summary" that is only used within the current session.

    The summary is made of up to three parts, joined with " | ":
    - the last ``max_questions`` student questions from ``history``
    - the last ``max_weaknesses`` entries of ``weaknesses``
    - a description of ``cognitive_state``

    Each question/difficulty is stripped and shortened to 120 characters.
    A limit of 0 (or less) leaves that part out entirely.

    Returns:
        The joined summary, or a fixed "No prior session memory..." text
        when none of the parts applies.
    """
    parts: List[str] = []

    # Guard the limits explicitly: seq[-0:] is seq[0:], i.e. the WHOLE list,
    # so slicing alone would turn "include none" into "include everything".
    if history and max_questions > 0:
        # Only the student side of each (student, assistant) turn is used.
        recent_qs = [_truncate_for_summary(u) for (u, _a) in history[-max_questions:]]
        if recent_qs:
            parts.append("Recent student questions: " + " | ".join(recent_qs))

    if weaknesses and max_weaknesses > 0:
        recent_weak = [_truncate_for_summary(w) for w in weaknesses[-max_weaknesses:]]
        parts.append("Recent difficulties mentioned by the student: " + " | ".join(recent_weak))

    if cognitive_state:
        parts.append("Current cognitive state: " + describe_cognitive_state(cognitive_state))

    if not parts:
        return (
            "No prior session memory. You can treat this as an early stage of the conversation; "
            "start with simple explanations and ask a quick check-up question."
        )

    return " | ".join(parts)


# ---------- 语言检测（用于 Auto 模式） ----------
def detect_language(message: str, preference: str) -> str:
    """
    Decide which language to use for a reply.

    preference:
      - 'English' -> always English
      - '中文'    -> always Chinese
      - anything else (e.g. 'Auto') -> '中文' if the message contains at
        least one character in U+4E00..U+9FFF, otherwise 'English'
    """
    explicit_choice = preference == "English" or preference == "中文"
    if explicit_choice:
        return preference

    # Auto mode: a single Chinese character anywhere is enough.
    has_chinese = re.search(r"[\u4e00-\u9fff]", message) is not None
    return "中文" if has_chinese else "English"


def get_empty_input_prompt(lang: str) -> str:
    """
    Friendly hint shown when the student sends an empty input.

    Returns the Chinese text when ``lang`` is '中文'; every other value gets
    the English text.
    """
    chinese_hint = "请先输入一个问题或想法，再按回车发送，我才能帮到你哦。"
    english_hint = "Please type a question or some text before sending, then hit Enter."
    return chinese_hint if lang == "中文" else english_hint


def build_error_message(
    e: Exception,
    lang: str,
    op: str = "chat",
) -> str:
    """
    Build a friendly, student-facing error message.

    The wording depends on the operation ("chat", "quiz" or "summary"; any
    other value gets a generic sentence) and on the language ('中文' gives
    Chinese, everything else English). A "please try again" suffix is always
    appended.

    ``e`` is accepted so callers can hand over the exception, but it is not
    read here: the returned text never contains exception details.
    """
    if lang == "中文":
        sentence_by_op = {
            "chat": "抱歉，刚刚在和模型对话时出现了一点问题。",
            "quiz": "抱歉，刚刚在生成测验题目时出现了一点问题。",
            "summary": "抱歉，刚刚在生成总结时出现了一点问题。",
        }
        generic_sentence = "抱歉，刚刚出现了一点问题。"
        retry_hint = " 请稍后再试一次，或者换个问法试试。"
    else:
        # English is the default for every non-Chinese language value.
        sentence_by_op = {
            "chat": "Sorry, I ran into a problem while talking to the model.",
            "quiz": "Sorry, there was a problem while generating the quiz.",
            "summary": "Sorry, there was a problem while generating the summary.",
        }
        generic_sentence = "Sorry, something went wrong just now."
        retry_hint = " Please try again in a moment or rephrase your request."

    return sentence_by_op.get(op, generic_sentence) + retry_hint


# ---------- Session 状态展示 ----------
def render_session_status(
    learning_mode: str,
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """
    Render the session status as a Markdown snippet.

    The snippet has a heading, the learning mode, the cognitive-state
    description and either the last three recorded difficulties (one
    sub-bullet each) or a "none yet" line.
    """
    status_lines: List[str] = [
        "### Session status\n",
        f"- Learning mode: **{learning_mode}**",
        f"- Cognitive state: {describe_cognitive_state(cognitive_state)}",
    ]

    if not weaknesses:
        status_lines.append("- Recent difficulties: *(none yet)*")
    else:
        status_lines.append("- Recent difficulties (last 3):")
        status_lines.extend(f"  - {item}" for item in weaknesses[-3:])

    return "\n".join(status_lines)


# ---------- Same Question Check helpers ----------
def _normalize_text(text: str) -> str:
    """
    将文本转为小写、去除标点和多余空格，用于简单相似度计算。
    """
    text = text.lower().strip()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text


def _jaccard_similarity(a: str, b: str) -> float:
    tokens_a = set(a.split())
    tokens_b = set(b.split())
    if not tokens_a or not tokens_b:
        return 0.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """
    Cosine similarity of two equally long vectors.

    Returns 0.0 when either vector is empty, when the lengths differ, or
    when either vector has zero magnitude.
    """
    if not (a and b) or len(a) != len(b):
        return 0.0

    dot_product = sum(x * y for x, y in zip(a, b))
    magnitude_a = math.sqrt(sum(x * x for x in a))
    magnitude_b = math.sqrt(sum(y * y for y in b))

    both_nonzero = magnitude_a != 0 and magnitude_b != 0
    if both_nonzero:
        return dot_product / (magnitude_a * magnitude_b)
    return 0.0

@traceable(run_type="embedding", name="get_embedding")
def get_embedding(text: str) -> Optional[List[float]]:
    """
    Encode ``text`` as a vector via ``client.embeddings.create``.

    The request uses EMBEDDING_MODEL and sends ``text`` as a one-element
    input list. Returns the embedding of the first item in the response, or
    None when the request or reading the response raises; the error is
    printed in that case.
    """
    try:
        response = client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
        first_item = response.data[0]
        return first_item.embedding
    except Exception as exc:
        # Printed so the failure shows up in the Space log for debugging.
        print(f"[Embedding error] {repr(exc)}")
        return None


def find_similar_past_question(
    message: str,
    history: List[Tuple[str, str]],
    jaccard_threshold: float = 0.65,
    embedding_threshold: float = 0.85,
    max_turns_to_check: int = 6,
) -> Optional[Tuple[str, str, float]]:
    """
    Look for an earlier question in the recent history that resembles ``message``.

    Only the most recent ``max_turns_to_check`` turns are inspected, newest
    first. Two stages are used:
      1. Jaccard similarity on normalised text (an identical normalised
         question returns immediately with score 1.0).
      2. Cosine similarity of embeddings, only if stage 1 found nothing
         above ``jaccard_threshold``.

    Returns (past_question, past_answer, similarity_score), or None when the
    message normalises to an empty string, no turn reaches the thresholds,
    or the message embedding cannot be obtained.
    """
    normalized_message = _normalize_text(message)
    if not normalized_message:
        return None

    # Stage 1: token overlap (Jaccard).
    top_jaccard = 0.0
    top_jaccard_turn: Optional[Tuple[str, str]] = None

    for turn_number, (past_q, past_a) in enumerate(reversed(history), start=1):
        if turn_number > max_turns_to_check:
            break

        normalized_past = _normalize_text(past_q)
        if not normalized_past:
            continue

        if normalized_past == normalized_message:
            return past_q, past_a, 1.0

        score = _jaccard_similarity(normalized_message, normalized_past)
        if score > top_jaccard:
            top_jaccard, top_jaccard_turn = score, (past_q, past_a)

    if top_jaccard_turn and top_jaccard >= jaccard_threshold:
        return top_jaccard_turn[0], top_jaccard_turn[1], top_jaccard

    # Stage 2: semantic similarity via embeddings.
    if not history:
        return None

    message_vector = get_embedding(message)
    if message_vector is None:
        return None

    top_cosine = 0.0
    top_cosine_turn: Optional[Tuple[str, str]] = None

    for turn_number, (past_q, past_a) in enumerate(reversed(history), start=1):
        if turn_number > max_turns_to_check:
            break

        past_vector = get_embedding(past_q)
        if past_vector is None:
            # Embedding failed for this turn; it still counts towards the limit.
            continue

        score = cosine_similarity(message_vector, past_vector)
        if score > top_cosine:
            top_cosine, top_cosine_turn = score, (past_q, past_a)

    if top_cosine_turn and top_cosine >= embedding_threshold:
        return top_cosine_turn[0], top_cosine_turn[1], top_cosine

    return None


@traceable(run_type="llm", name="safe_chat_completion")
def safe_chat_completion(
    model_name: str,
    messages: List[Dict[str, str]],
    lang: str,
    op: str = "chat",
    temperature: float = 0.5,
) -> str:
    """
    Call the chat completion API with a fallback model and friendly errors.

    - At most two attempts are made.
    - Each request is sent with ``timeout=20``.
    - The first attempt uses ``model_name`` (or DEFAULT_MODEL when it is
      empty). If that fails and the model was not already DEFAULT_MODEL, one
      more attempt is made with DEFAULT_MODEL.
    - Every failure is printed; on total failure the caller only receives
      the text produced by ``build_error_message`` for ``lang`` and ``op``.

    Returns the content of the first choice's message on success.
    """
    preferred_model = model_name or DEFAULT_MODEL

    # Models to try, in order; the fallback is skipped when it would just
    # repeat the same model.
    candidates = [preferred_model]
    if preferred_model != DEFAULT_MODEL:
        candidates.append(DEFAULT_MODEL)

    last_error: Optional[Exception] = None
    for attempt_number, current_model in enumerate(candidates, start=1):
        try:
            resp = client.chat.completions.create(
                model=current_model,
                messages=messages,
                temperature=temperature,
                timeout=20,  # 20-second timeout
            )
            return resp.choices[0].message.content
        except Exception as exc:
            print(
                f"[safe_chat_completion][{op}] attempt {attempt_number} "
                f"failed with model={current_model}: {repr(exc)}"
            )
            last_error = exc

    # Every candidate failed: hand back the friendly error text.
    return build_error_message(last_error or Exception("unknown error"), lang, op)


# ---------- 构建 messages ----------
def build_messages(
    user_message: str,
    history: List[Tuple[str, str]],
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,   # optional RAG retrieval result
) -> List[Dict[str, str]]:
    """
    Assemble the full chat-completion message list for one student turn.

    System messages are added in this order: Clare's system prompt, the
    learning-mode instruction (if the mode is known), the syllabus topics
    (``course_outline`` or DEFAULT_COURSE_TOPICS), a note about a non-syllabus
    upload, the last five recorded difficulties, pacing guidance derived from
    ``cognitive_state``, the language instruction, the session-memory
    summary and, if given, the RAG excerpts. They are followed by the
    history turns and finally the current ``user_message``.
    """
    conversation: List[Dict[str, str]] = []

    def add_system(content: str) -> None:
        conversation.append({"role": "system", "content": content})

    add_system(CLARE_SYSTEM_PROMPT)

    # Learning mode
    if learning_mode in LEARNING_MODE_INSTRUCTIONS:
        mode_instruction = LEARNING_MODE_INSTRUCTIONS[learning_mode]
        add_system(f"Current learning mode: {learning_mode}. {mode_instruction}")

    # Course outline (falls back to the default topics)
    topics_text = " | ".join(course_outline or DEFAULT_COURSE_TOPICS)
    add_system(
        "Here is the course syllabus context. Use this to stay aligned "
        "with the course topics when answering: "
        + topics_text
    )

    # Hint about the type of the uploaded file
    if doc_type and doc_type != "Syllabus":
        add_system(
            f"The student also uploaded a {doc_type} document as supporting material. "
            "You do not see the full content directly, but you may assume it is relevant "
            "to the same course and topics."
        )

    # Student difficulties
    if weaknesses:
        weak_text = " | ".join(weaknesses[-5:])
        add_system(
            "The student seems to struggle with the following questions or topics. "
            "Be extra gentle and clear when these appear: " + weak_text
        )

    # Cognitive-state guidance (same thresholds as describe_cognitive_state)
    if cognitive_state:
        confusion = cognitive_state.get("confusion", 0)
        mastery = cognitive_state.get("mastery", 0)
        if confusion >= 2 and confusion >= mastery + 1:
            guidance = (
                "The student is currently under HIGH cognitive load. "
                "Use simpler language, shorter steps, and more concrete examples. "
                "Avoid long derivations in a single answer, and check understanding "
                "frequently."
            )
        elif mastery >= 2 and mastery >= confusion + 1:
            guidance = (
                "The student seems comfortable with the material. "
                "You may increase difficulty slightly, introduce deeper follow-up "
                "questions, and connect concepts across topics."
            )
        else:
            guidance = (
                "The student's cognitive state is mixed or uncertain. "
                "Keep explanations clear and moderately paced, and probe for "
                "understanding with short questions."
            )
        add_system(guidance)

    # Language preference (no instruction is added for any other value)
    if language_preference == "English":
        add_system("Please answer in English.")
    elif language_preference == "中文":
        add_system("请用中文回答学生的问题。")

    # In-session memory summary
    session_memory_text = build_session_memory_summary(
        history=history,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
    )
    add_system(
        "Here is a short summary of this session's memory (only within the current chat; "
        "it is not persisted across sessions). Use it to stay consistent with the "
        "student's previous questions, difficulties, and cognitive state: "
        + session_memory_text
    )

    # RAG retrieval result
    if rag_context:
        add_system(
            "Here are some relevant excerpts from the course materials. "
            "Use them as the primary factual grounding when answering the student's question. "
            "If there is any conflict between these excerpts and your prior knowledge, "
            "prefer the excerpts.\n\n"
            + rag_context
        )

    # Previous turns; a turn without an assistant reply contributes only the user side
    for past_user, past_assistant in history:
        conversation.append({"role": "user", "content": past_user})
        if past_assistant is None:
            continue
        conversation.append({"role": "assistant", "content": past_assistant})

    # Current input
    conversation.append({"role": "user", "content": user_message})
    return conversation

# 装饰器
@traceable(run_type="chain", name="chat_with_clare")
def chat_with_clare(
    message: str,
    history: List[Tuple[str, str]],
    model_name: str,
    language_preference: str,
    learning_mode: str,
    doc_type: str,
    course_outline: Optional[List[str]],
    weaknesses: Optional[List[str]],
    cognitive_state: Optional[Dict[str, int]],
    rag_context: Optional[str] = None,
) -> Tuple[str, List[Tuple[str, str]]]:
    """
    Run one chat turn with Clare.

    Builds the message list with ``build_messages``, sends it through
    ``safe_chat_completion`` (temperature 0.5, op "chat") and appends the
    new (message, answer) pair to a copy of ``history``.

    Returns:
        (answer, updated_history). ``history`` itself is not modified; the
        returned list is a new one. When the model call fails, ``answer`` is
        the friendly error text from ``safe_chat_completion``.
    """
    # Tracing metadata is best-effort: a failure here must not block the chat.
    try:
        set_run_metadata(
            learning_mode=learning_mode,
            language_preference=language_preference,
            doc_type=doc_type,
        )
    except Exception as e:
        print(f"[LangSmith metadata error in chat_with_clare] {repr(e)}")

    messages = build_messages(
        user_message=message,
        history=history,
        language_preference=language_preference,
        learning_mode=learning_mode,
        doc_type=doc_type,
        course_outline=course_outline,
        weaknesses=weaknesses,
        cognitive_state=cognitive_state,
        rag_context=rag_context,
    )

    # The error text is chosen by comparing the language with "中文", so a raw
    # preference such as "Auto" would always yield English. Resolve it from
    # the message first so Chinese input gets a Chinese error message.
    error_lang = detect_language(message or "", language_preference)

    answer = safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=error_lang,
        op="chat",
        temperature=0.5,
    )

    return answer, history + [(message, answer)]


# ---------- 导出对话为 Markdown ----------
def export_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    learning_mode_val: str,
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
) -> str:
    """
    Export the conversation as a Markdown document.

    The header lists the learning mode, the first five course topics, the
    cognitive-state description and, if any, the last five recorded
    difficulties. Each history turn follows as a "Student"/"Clare" pair
    separated by horizontal rules.
    """
    topics_preview = "; ".join(course_outline[:5])
    chunks: List[str] = [
        "# Clare – Conversation Export\n",
        f"- Learning mode: **{learning_mode_val}**\n",
        "- Course topics (short): " + topics_preview + "\n",
        f"- Cognitive state snapshot: {describe_cognitive_state(cognitive_state)}\n",
    ]

    if weaknesses:
        chunks.append("- Observed student difficulties:\n")
        chunks.extend(f"  - {difficulty}\n" for difficulty in weaknesses[-5:])
    chunks.append("\n---\n\n")

    for student_turn, clare_turn in history:
        chunks.extend(
            (
                f"**Student:** {student_turn}\n\n",
                f"**Clare:** {clare_turn}\n\n",
                "---\n\n",
            )
        )

    return "".join(chunks)


# ---------- 生成 3 个 quiz 题目 ----------
from langsmith import traceable

@traceable(run_type="chain", name="generate_quiz_from_history")
def generate_quiz_from_history(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """
    Ask the model for a three-question quiz based on the recent conversation.

    The prompt contains the last eight history turns, the first eight course
    topics, the last five recorded difficulties ("N/A" when there are none)
    and the cognitive-state description. A Chinese-language instruction is
    appended when ``language_preference`` is '中文'.

    Returns whatever ``safe_chat_completion`` returns for op "quiz".
    """
    conversation_text = "".join(
        f"Student: {user}\nClare: {assistant}\n" for user, assistant in history[-8:]
    )
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    task_prompt = (
        "Now your task is to create a **short concept quiz** for the student. "
        "Based on the conversation and course topics, generate **3 questions** "
        "(a mix of multiple-choice and short-answer is fine). After listing the "
        "questions, provide an answer key at the end under a heading 'Answer Key'. "
        "Number the questions Q1, Q2, Q3. Adjust the difficulty according to the "
        "student's cognitive state."
    )
    system_texts = [
        CLARE_SYSTEM_PROMPT,
        task_prompt,
        f"Course topics: {topics_text}",
        f"Student known difficulties: {weakness_text}",
        f"Student cognitive state: {cog_text}",
    ]

    messages = [{"role": "system", "content": text} for text in system_texts]
    messages.append(
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + conversation_text
                + "\n\nPlease create the quiz now."
            ),
        }
    )

    if language_preference == "中文":
        messages.append({"role": "system", "content": "请用中文给出问题和答案。"})

    return safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="quiz",
        temperature=0.5,
    )


# ---------- 概念总结（知识点摘要） ----------
@traceable(run_type="chain", name="summarize_conversation")
def summarize_conversation(
    history: List[Tuple[str, str]],
    course_outline: List[str],
    weaknesses: List[str],
    cognitive_state: Optional[Dict[str, int]],
    model_name: str,
    language_preference: str,
) -> str:
    """
    Ask the model for a concept-only summary of the recent conversation.

    The prompt contains the last ten history turns, the first eight course
    topics, the last five recorded difficulties ("N/A" when there are none)
    and the cognitive-state description. A Chinese-language instruction is
    appended when ``language_preference`` is '中文'.

    Returns whatever ``safe_chat_completion`` returns for op "summary"
    (temperature 0.4).
    """
    conversation_text = "".join(
        f"Student: {user}\nClare: {assistant}\n" for user, assistant in history[-10:]
    )
    topics_text = "; ".join(course_outline[:8])
    weakness_text = "; ".join(weaknesses[-5:]) if weaknesses else "N/A"
    cog_text = describe_cognitive_state(cognitive_state)

    task_prompt = (
        "Your task now is to produce a **concept-only summary** of this tutoring "
        "session. Only include knowledge points, definitions, key formulas, "
        "examples, and main takeaways. Do **not** include any personal remarks, "
        "jokes, or off-topic chat. Write in clear bullet points. This summary "
        "should be suitable for the student to paste into their study notes. "
        "Take into account what the student struggled with and their cognitive state."
    )
    system_texts = [
        CLARE_SYSTEM_PROMPT,
        task_prompt,
        f"Course topics context: {topics_text}",
        f"Student known difficulties: {weakness_text}",
        f"Student cognitive state: {cog_text}",
    ]

    messages = [{"role": "system", "content": text} for text in system_texts]
    messages.append(
        {
            "role": "user",
            "content": (
                "Here is the recent conversation between you and the student:\n\n"
                + conversation_text
                + "\n\nPlease summarize only the concepts and key ideas learned."
            ),
        }
    )

    if language_preference == "中文":
        messages.append(
            {
                "role": "system",
                "content": "请用中文给出要点总结，只保留知识点和结论，使用条目符号。",
            }
        )

    return safe_chat_completion(
        model_name=model_name,
        messages=messages,
        lang=language_preference,
        op="summary",
        temperature=0.4,
    )