test_AI_Agent

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 12, 2025

Commit

52e07b7

verified ·

1 Parent(s): f30f379

Update rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +65 -23

rag_engine.py CHANGED Viewed

@@ -1,15 +1,19 @@
 # rag_engine.py
 import os
-from typing import List, Dict
-from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf
 from clare_core import (
     get_embedding,
     cosine_similarity,
 )
 from langsmith import traceable
 from langsmith.run_helpers import set_run_metadata
-from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf, parse_pptx_slides
 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
@@ -17,7 +21,15 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     - 支持 .docx / .pdf / .pptx
     - 复用 syllabus_utils 里的解析函数，把文档切成一系列文本块
-    - 对每个非空文本块做 embedding，存成 {"text": str, "embedding": List[float]}
     """
     if file is None:
         return []
@@ -27,6 +39,7 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
         return []
     ext = os.path.splitext(file_path)[1].lower()
     try:
         # 1) 解析文件 → 文本块列表
@@ -40,18 +53,33 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
             print(f"[RAG] unsupported file type for RAG: {ext}")
             return []
-        # 2) 对每个文本块做 embedding
         chunks: List[Dict] = []
-        for t in texts:
-            text = t.strip()
             if not text:
                 continue
             emb = get_embedding(text)
             if emb is None:
                 continue
-            chunks.append({"text": text, "embedding": emb})
-        print(f"[RAG] built {len(chunks)} chunks from uploaded file ({ext}, doc_type={doc_type_val})")
         return chunks
     except Exception as e:
@@ -64,18 +92,22 @@ def retrieve_relevant_chunks(
     question: str,
     rag_chunks: List[Dict],
     top_k: int = 3,
-) -> str:
     """
-    用 embedding 对当前问题做一次检索，从 rag_chunks 中找出最相关的 top_k 段落，
-    返回拼接后的文本，供 prompt 使用。
-    （增强版本：将检索内容记录到 LangSmith metadata）
     """
     if not rag_chunks:
-        return ""
     q_emb = get_embedding(question)
     if q_emb is None:
-        return ""
     scored = []
     for item in rag_chunks:
@@ -84,20 +116,31 @@ def retrieve_relevant_chunks(
         if not emb or not text:
             continue
         sim = cosine_similarity(q_emb, emb)
-        scored.append((sim, text))
     if not scored:
-        return ""
     scored.sort(key=lambda x: x[0], reverse=True)
     top_items = scored[:top_k]
-    top_chunks = [t for _sim, t in top_items]
-    # 使用 set_run_metadata 给当前 retriever run 打 metadata
     try:
         previews = [
-            {"score": float(sim), "text_preview": text[:300]}
-            for sim, text in top_items
         ]
         set_run_metadata(
             question=question,
@@ -107,5 +150,4 @@ def retrieve_relevant_chunks(
         # observability 出错不能影响主流程
         print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
-    # 用分隔线拼接，方便模型辨认不同片段
-    return "\n---\n".join(top_chunks)

 # rag_engine.py
 import os
+from typing import List, Dict, Tuple
+from syllabus_utils import (
+    parse_syllabus_docx,
+    parse_syllabus_pdf,
+    parse_pptx_slides,
+)
 from clare_core import (
     get_embedding,
     cosine_similarity,
 )
 from langsmith import traceable
 from langsmith.run_helpers import set_run_metadata
 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
     - 支持 .docx / .pdf / .pptx
     - 复用 syllabus_utils 里的解析函数，把文档切成一系列文本块
+    - 对每个非空文本块做 embedding
+    每个 chunk 结构：
+    {
+        "text": str,
+        "embedding": List[float],
+        "source_file": str,   # 文件名（用于 UI reference）
+        "section": str,       # 简易 section 标记，如 "Syllabus – Section 3"
+    }
     """
     if file is None:
         return []
         return []
     ext = os.path.splitext(file_path)[1].lower()
+    file_name = os.path.basename(file_path)
     try:
         # 1) 解析文件 → 文本块列表
             print(f"[RAG] unsupported file type for RAG: {ext}")
             return []
+        # 2) 对每个文本块做 embedding，同时写入 metadata
         chunks: List[Dict] = []
+        for idx, t in enumerate(texts):
+            text = (t or "").strip()
             if not text:
                 continue
             emb = get_embedding(text)
             if emb is None:
                 continue
+            # 简易的 section 标记：<doc_type> – Section <n>
+            section_label = f"{doc_type_val} – Section {idx + 1}"
+            chunks.append(
+                {
+                    "text": text,
+                    "embedding": emb,
+                    "source_file": file_name,
+                    "section": section_label,
+                }
+            )
+        print(
+            f"[RAG] built {len(chunks)} chunks from uploaded file "
+            f"({file_name}, ext={ext}, doc_type={doc_type_val})"
+        )
         return chunks
     except Exception as e:
     question: str,
     rag_chunks: List[Dict],
     top_k: int = 3,
+) -> Tuple[str, List[Dict]]:
     """
+    用 embedding 对当前问题做一次检索，从 rag_chunks 中找出最相关的 top_k 段落。
+    返回:
+        context_text: str        # 拼接后的文本，给 LLM prompt 使用
+        top_chunks:   List[Dict] # 本次实际使用到的 chunks（带 source_file / section）
+    同时将检索结果写入 LangSmith metadata，便于后续观测。
     """
     if not rag_chunks:
+        return "", []
     q_emb = get_embedding(question)
     if q_emb is None:
+        return "", []
     scored = []
     for item in rag_chunks:
         if not emb or not text:
             continue
         sim = cosine_similarity(q_emb, emb)
+        scored.append((sim, item))
     if not scored:
+        return "", []
+    # 按相似度从高到低排序
     scored.sort(key=lambda x: x[0], reverse=True)
     top_items = scored[:top_k]
+    # 取出 top_k 的 chunk dict
+    top_chunks: List[Dict] = [item for _sim, item in top_items]
+    # 拼接文本给模型使用
+    context_text = "\n---\n".join(ch["text"] for ch in top_chunks if ch.get("text"))
+    # 将一些预览信息写到 LangSmith metadata
     try:
         previews = [
+            {
+                "score": float(sim),
+                "text_preview": (item.get("text") or "")[:300],
+                "source_file": item.get("source_file"),
+                "section": item.get("section"),
+            }
+            for sim, item in top_items
         ]
         set_run_metadata(
             question=question,
         # observability 出错不能影响主流程
         print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
+    return context_text, top_chunks