test_AI_Agent

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 12, 2025

Commit

fe9dde2

verified ·

1 Parent(s): 21a9f67

Update rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +32 -37

rag_engine.py CHANGED Viewed

@@ -17,32 +17,34 @@ from langsmith.run_helpers import set_run_metadata
 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
-    从上传的文件构建 RAG chunk 列表（session 级别）。
-    - 支持 .docx / .pdf / .pptx
-    - 复用 syllabus_utils 里的解析函数，把文档切成一系列文本块
-    - 对每个非空文本块做 embedding
     每个 chunk 结构：
     {
         "text": str,
         "embedding": List[float],
-        "source_file": str,   # 文件名（用于 UI reference）
-        "section": str,       # 简易 section 标记，如 "Syllabus – Section 3"
     }
     """
-    if file is None:
-        return []
-    file_path = getattr(file, "name", None)
     if not file_path:
         return []
     ext = os.path.splitext(file_path)[1].lower()
-    file_name = os.path.basename(file_path)
     try:
-        # 1) 解析文件 → 文本块列表
         if ext == ".docx":
             texts = parse_syllabus_docx(file_path)
         elif ext == ".pdf":
@@ -53,32 +55,28 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
             print(f"[RAG] unsupported file type for RAG: {ext}")
             return []
-        # 2) 对每个文本块做 embedding，同时写入 metadata
         chunks: List[Dict] = []
         for idx, t in enumerate(texts):
             text = (t or "").strip()
             if not text:
                 continue
             emb = get_embedding(text)
             if emb is None:
                 continue
-            # 简易的 section 标记：<doc_type> – Section <n>
-            section_label = f"{doc_type_val} – Section {idx + 1}"
             chunks.append(
                 {
                     "text": text,
                     "embedding": emb,
-                    "source_file": file_name,
                     "section": section_label,
                 }
             )
         print(
-            f"[RAG] built {len(chunks)} chunks from uploaded file "
-            f"({file_name}, ext={ext}, doc_type={doc_type_val})"
         )
         return chunks
@@ -94,13 +92,11 @@ def retrieve_relevant_chunks(
     top_k: int = 3,
 ) -> Tuple[str, List[Dict]]:
     """
-    用 embedding 对当前问题做一次检索，从 rag_chunks 中找出最相关的 top_k 段落。
-    返回:
-        context_text: str        # 拼接后的文本，给 LLM prompt 使用
-        top_chunks:   List[Dict] # 本次实际使用到的 chunks（带 source_file / section）
-    同时将检索结果写入 LangSmith metadata，便于后续观测。
     """
     if not rag_chunks:
         return "", []
@@ -121,33 +117,32 @@ def retrieve_relevant_chunks(
     if not scored:
         return "", []
-    # 按相似度从高到低排序
     scored.sort(key=lambda x: x[0], reverse=True)
     top_items = scored[:top_k]
-    # 取出 top_k 的 chunk dict
-    top_chunks: List[Dict] = [item for _sim, item in top_items]
-    # 拼接文本给模型使用
-    context_text = "\n---\n".join(ch["text"] for ch in top_chunks if ch.get("text"))
-    # 将一些预览信息写到 LangSmith metadata
     try:
         previews = [
             {
                 "score": float(sim),
-                "text_preview": (item.get("text") or "")[:300],
-                "source_file": item.get("source_file"),
-                "section": item.get("section"),
             }
-            for sim, item in top_items
         ]
         set_run_metadata(
             question=question,
             retrieved_chunks=previews,
         )
     except Exception as e:
-        # observability 出错不能影响主流程
         print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
-    return context_text, top_chunks

 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
+    从文件构建 RAG chunk 列表（session 级别）。
+    支持两种输入形式：
+    - file 是上传文件对象（带 .name）
+    - file 是字符串路径（用于预加载 Module10）
     每个 chunk 结构：
     {
         "text": str,
         "embedding": List[float],
+        "source_file": "module10_responsible_ai.pdf",
+        "section": "Literature Review / Paper – chunk 3"
     }
     """
+    # 1) 统一拿到文件路径
+    if isinstance(file, str):
+        file_path = file
+    else:
+        file_path = getattr(file, "name", None)
     if not file_path:
         return []
     ext = os.path.splitext(file_path)[1].lower()
+    basename = os.path.basename(file_path)
     try:
+        # 2) 解析文件 → 文本块列表
         if ext == ".docx":
             texts = parse_syllabus_docx(file_path)
         elif ext == ".pdf":
             print(f"[RAG] unsupported file type for RAG: {ext}")
             return []
+        # 3) 对每个文本块做 embedding，并附上 metadata
         chunks: List[Dict] = []
         for idx, t in enumerate(texts):
             text = (t or "").strip()
             if not text:
                 continue
             emb = get_embedding(text)
             if emb is None:
                 continue
+            section_label = f"{doc_type_val} – chunk {idx + 1}"
             chunks.append(
                 {
                     "text": text,
                     "embedding": emb,
+                    "source_file": basename,
                     "section": section_label,
                 }
             )
         print(
+            f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
         )
         return chunks
     top_k: int = 3,
 ) -> Tuple[str, List[Dict]]:
     """
+    用 embedding 对当前问题做检索，从 rag_chunks 中找出最相关的 top_k 段落。
+    返回：
+    - context_text: 拼接后的文本（给 LLM 用）
+    - used_chunks:   本轮实际用到的 chunk 列表（给 reference 用）
     """
     if not rag_chunks:
         return "", []
     if not scored:
         return "", []
     scored.sort(key=lambda x: x[0], reverse=True)
     top_items = scored[:top_k]
+    # 供 LLM 使用的拼接上下文
+    top_texts = [it["text"] for _sim, it in top_items]
+    context_text = "\n---\n".join(top_texts)
+    # 供 reference & logging 使用的详细 chunk
+    used_chunks = [it for _sim, it in top_items]
+    # LangSmith metadata（可选）
     try:
         previews = [
             {
                 "score": float(sim),
+                "text_preview": it["text"][:200],
+                "source_file": it.get("source_file"),
+                "section": it.get("section"),
             }
+            for sim, it in top_items
         ]
         set_run_metadata(
             question=question,
             retrieved_chunks=previews,
         )
     except Exception as e:
         print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
+    return context_text, used_chunks