AI_Agent_Final

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 5, 2025

Commit

ddeab24

verified ·

1 Parent(s): e285f2a

Update rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +28 -15

rag_engine.py CHANGED Viewed

@@ -1,8 +1,9 @@
 # rag_engine.py
-from typing import List, Dict, Optional
 from clare_core import (
-    parse_syllabus_docx,
     get_embedding,
     cosine_similarity,
 )
@@ -12,26 +13,38 @@ from langsmith.run_helpers import set_run_metadata
 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
-    从上传的文件构建 RAG chunk 列表（session 级别）：
-    - 目前只支持 .docx
-    - 使用 parse_syllabus_docx 把文档按段落切片
-    - 对每个非空段落做 embedding，存成 {"text": str, "embedding": List[float]}
     """
     if file is None:
         return []
     try:
-        file_path = file.name
-        if not file_path.lower().endswith(".docx"):
-            # 目前先只支持 docx，后面可以扩展 pdf / txt
             return []
-        # 多取一些行，比课程大纲用的 15 更长
-        paragraphs = parse_syllabus_docx(file_path, max_lines=100)
         chunks: List[Dict] = []
-        for para in paragraphs:
-            text = para.strip()
             if not text:
                 continue
             emb = get_embedding(text)
@@ -39,7 +52,7 @@ def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
                 continue
             chunks.append({"text": text, "embedding": emb})
-        print(f"[RAG] built {len(chunks)} chunks from uploaded file")
         return chunks
     except Exception as e:

 # rag_engine.py
+import os
+from typing import List, Dict
+from syllabus_utils import parse_syllabus_docx, parse_syllabus_pdf
 from clare_core import (
     get_embedding,
     cosine_similarity,
 )
 def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
     """
+    从上传的文件构建 RAG chunk 列表（session 级别）。
+    - 支持 .docx 和 .pdf
+    - 复用 syllabus_utils 里的解析函数，把文档切成一系列文本块
+    - 对每个非空文本块做 embedding，存成 {"text": str, "embedding": List[float]}
+    当前 doc_type_val 主要用于未来扩展（不同类型文件可采用不同切块策略），
+    这里先不区分，统一按段落/块处理。
     """
     if file is None:
         return []
+    file_path = getattr(file, "name", None)
+    if not file_path:
+        return []
+    ext = os.path.splitext(file_path)[1].lower()
     try:
+        # 1) 解析文件 → 得到一组文本块
+        if ext == ".docx":
+            texts = parse_syllabus_docx(file_path)
+        elif ext == ".pdf":
+            texts = parse_syllabus_pdf(file_path)
+        else:
+            print(f"[RAG] unsupported file type for RAG: {ext}")
             return []
+        # 2) 对每个文本块做 embedding
         chunks: List[Dict] = []
+        for t in texts:
+            text = t.strip()
             if not text:
                 continue
             emb = get_embedding(text)
                 continue
             chunks.append({"text": text, "embedding": emb})
+        print(f"[RAG] built {len(chunks)} chunks from uploaded file ({ext}, doc_type={doc_type_val})")
         return chunks
     except Exception as e: