SarahXia0405 committed on
Commit
26220ae
·
verified ·
1 Parent(s): 337c831

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -14
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import re
 
3
  from typing import List, Dict, Tuple, Optional
4
 
5
  import gradio as gr
@@ -15,6 +16,7 @@ if not OPENAI_API_KEY:
15
 
16
  client = OpenAI(api_key=OPENAI_API_KEY)
17
  DEFAULT_MODEL = "gpt-4.1-mini"
 
18
 
19
  # ---------- 默认 GenAI 课程大纲 ----------
20
  DEFAULT_COURSE_TOPICS = [
@@ -201,7 +203,6 @@ def _normalize_text(text: str) -> str:
201
  text = text.lower().strip()
202
  # 去掉标点符号,只保留字母数字和空格
203
  text = re.sub(r"[^\w\s]", " ", text)
204
- # 合并多余空格
205
  text = re.sub(r"\s+", " ", text)
206
  return text
207
 
@@ -214,27 +215,58 @@ def _jaccard_similarity(a: str, b: str) -> float:
214
  return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
215
 
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  def find_similar_past_question(
218
  message: str,
219
  history: List[Tuple[str, str]],
220
- similarity_threshold: float = 0.8,
 
221
  max_turns_to_check: int = 6,
222
  ) -> Optional[Tuple[str, str, float]]:
223
  """
224
  在最近若干轮历史对话中查找与当前问题相似的既往问题。
225
 
 
 
 
 
226
  返回:
227
- (past_question, past_answer, similarity) 或 None
228
  """
 
229
  norm_msg = _normalize_text(message)
230
  if not norm_msg:
231
  return None
232
 
233
- best_sim = 0.0
234
- best_pair: Optional[Tuple[str, str]] = None
235
  checked = 0
236
 
237
- # 从最近一轮往前看
238
  for user_q, assistant_a in reversed(history):
239
  checked += 1
240
  if checked > max_turns_to_check:
@@ -244,17 +276,49 @@ def find_similar_past_question(
244
  if not norm_hist_q:
245
  continue
246
 
247
- # 完全相同直接返回
248
  if norm_msg == norm_hist_q:
 
249
  return user_q, assistant_a, 1.0
250
 
251
- sim = _jaccard_similarity(norm_msg, norm_hist_q)
252
- if sim > best_sim:
253
- best_sim = sim
254
- best_pair = (user_q, assistant_a)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- if best_pair and best_sim >= similarity_threshold:
257
- return best_pair[0], best_pair[1], best_sim
258
 
259
  return None
260
 
@@ -704,7 +768,6 @@ with gr.Blocks(title="Clare – Hanbridge AI Teaching Assistant") as demo:
704
  dup = find_similar_past_question(message, chat_history)
705
  if dup is not None:
706
  past_q, past_a, sim = dup
707
- # 直接复用之前回答,并给一个简短提示
708
  prefix_en = (
709
  "I noticed this question is very similar to one you asked earlier, "
710
  "so I'm showing the previous explanation again. "
 
1
  import os
2
  import re
3
+ import math
4
  from typing import List, Dict, Tuple, Optional
5
 
6
  import gradio as gr
 
16
 
17
  client = OpenAI(api_key=OPENAI_API_KEY)
18
  DEFAULT_MODEL = "gpt-4.1-mini"
19
+ EMBEDDING_MODEL = "text-embedding-3-small"
20
 
21
  # ---------- 默认 GenAI 课程大纲 ----------
22
  DEFAULT_COURSE_TOPICS = [
 
203
  text = text.lower().strip()
204
  # 去掉标点符号,只保留字母数字和空格
205
  text = re.sub(r"[^\w\s]", " ", text)
 
206
  text = re.sub(r"\s+", " ", text)
207
  return text
208
 
 
215
  return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
216
 
217
 
218
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Degrades to 0.0 (instead of raising) for empty inputs, mismatched
    lengths, or zero-magnitude vectors, so callers can treat "no signal"
    and "no similarity" uniformly.
    """
    # Guard: anything we cannot meaningfully compare scores as 0.0.
    if not a or not b or len(a) != len(b):
        return 0.0
    # Single pass accumulating dot product and squared magnitudes.
    dot_product = 0.0
    sq_a = 0.0
    sq_b = 0.0
    for x, y in zip(a, b):
        dot_product += x * y
        sq_a += x * x
        sq_b += y * y
    norm_a = math.sqrt(sq_a)
    norm_b = math.sqrt(sq_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot_product / (norm_a * norm_b)
227
+
228
+
229
def get_embedding(text: str) -> Optional[List[float]]:
    """Encode *text* as a vector via the OpenAI Embedding API.

    Returns the embedding vector on success, or None when the API call
    fails for any reason — the caller degrades gracefully rather than
    letting an embedding error block the main chat flow.
    """
    try:
        response = client.embeddings.create(model=EMBEDDING_MODEL, input=[text])
        return response.data[0].embedding
    except Exception:
        # Best-effort: swallow the failure and signal "no embedding".
        return None
242
+
243
+
244
  def find_similar_past_question(
245
  message: str,
246
  history: List[Tuple[str, str]],
247
+ jaccard_threshold: float = 0.65,
248
+ embedding_threshold: float = 0.85,
249
  max_turns_to_check: int = 6,
250
  ) -> Optional[Tuple[str, str, float]]:
251
  """
252
  在最近若干轮历史对话中查找与当前问题相似的既往问题。
253
 
254
+ 两级检测:
255
+ 1. 先用 Jaccard 做快速近似匹配(文本几乎一样的情况)
256
+ 2. 再用 OpenAI embedding 做语义相似度检测(改写、同义句)
257
+
258
  返回:
259
+ (past_question, past_answer, similarity_score) 或 None
260
  """
261
+ # ---------- 第一步:Jaccard 快速检测 ----------
262
  norm_msg = _normalize_text(message)
263
  if not norm_msg:
264
  return None
265
 
266
+ best_sim_j = 0.0
267
+ best_pair_j: Optional[Tuple[str, str]] = None
268
  checked = 0
269
 
 
270
  for user_q, assistant_a in reversed(history):
271
  checked += 1
272
  if checked > max_turns_to_check:
 
276
  if not norm_hist_q:
277
  continue
278
 
 
279
  if norm_msg == norm_hist_q:
280
+ # 完全相同,直接视为重复
281
  return user_q, assistant_a, 1.0
282
 
283
+ sim_j = _jaccard_similarity(norm_msg, norm_hist_q)
284
+ if sim_j > best_sim_j:
285
+ best_sim_j = sim_j
286
+ best_pair_j = (user_q, assistant_a)
287
+
288
+ if best_pair_j and best_sim_j >= jaccard_threshold:
289
+ # 词面高度相似,直接视为重复
290
+ return best_pair_j[0], best_pair_j[1], best_sim_j
291
+
292
+ # ---------- 第二步:Embedding 语义相似度 ----------
293
+ # 如果历史太少,就没必要算 embedding
294
+ if not history:
295
+ return None
296
+
297
+ msg_emb = get_embedding(message)
298
+ if msg_emb is None:
299
+ # embedding 调用失败,放弃语义检测
300
+ return None
301
+
302
+ best_sim_e = 0.0
303
+ best_pair_e: Optional[Tuple[str, str]] = None
304
+ checked = 0
305
+
306
+ for user_q, assistant_a in reversed(history):
307
+ checked += 1
308
+ if checked > max_turns_to_check:
309
+ break
310
+
311
+ hist_emb = get_embedding(user_q)
312
+ if hist_emb is None:
313
+ continue
314
+
315
+ sim_e = cosine_similarity(msg_emb, hist_emb)
316
+ if sim_e > best_sim_e:
317
+ best_sim_e = sim_e
318
+ best_pair_e = (user_q, assistant_a)
319
 
320
+ if best_pair_e and best_sim_e >= embedding_threshold:
321
+ return best_pair_e[0], best_pair_e[1], best_sim_e
322
 
323
  return None
324
 
 
768
  dup = find_similar_past_question(message, chat_history)
769
  if dup is not None:
770
  past_q, past_a, sim = dup
 
771
  prefix_en = (
772
  "I noticed this question is very similar to one you asked earlier, "
773
  "so I'm showing the previous explanation again. "