test_AI_Agent

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 2, 2025

Commit

9f89ffb

verified ·

1 Parent(s): 7c00bd4

Create rag_engine.py

Browse files

Files changed (1) hide show

rag_engine.py +80 -0

rag_engine.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# rag_engine.py
+from typing import List, Dict, Optional
+from clare_core import (
+    parse_syllabus_docx,
+    get_embedding,
+    cosine_similarity,
+)
def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
    """
    Build the session-level list of RAG chunks from an uploaded file.

    Only ``.docx`` uploads are handled for now. The document is split into
    paragraphs with ``parse_syllabus_docx`` and every non-empty paragraph is
    embedded with ``get_embedding``.

    Args:
        file: The upload. Either an object that exposes the on-disk path as
            ``.name``, or the path itself as a ``str`` / ``os.PathLike``.
        doc_type_val: Document type label supplied by the caller. It is not
            used by this function at the moment.

    Returns:
        A list of ``{"text": ..., "embedding": ...}`` dicts, one per paragraph
        for which ``get_embedding`` returned something other than ``None``.
        An empty list is returned when ``file`` is ``None``, when the path
        does not end in ``.docx``, or when an ``Exception`` is raised while
        processing (the error is printed and swallowed: this is best-effort).
    """
    import os  # function-scope import: only needed for path normalisation

    if file is None:
        return []
    try:
        # A plain path must not go through ``.name``: ``str`` has no such
        # attribute, and ``pathlib.Path.name`` is only the basename, which
        # would silently drop the directory part.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fsdecode(file)
        else:
            file_path = file.name
        if not file_path.lower().endswith(".docx"):
            # Only docx is supported for now; pdf / txt can be added later.
            return []
        # Read more lines than the 15 used for the course syllabus.
        paragraphs = parse_syllabus_docx(file_path, max_lines=100)
        chunks: List[Dict] = []
        for para in paragraphs:
            text = para.strip()
            if not text:
                continue
            emb = get_embedding(text)
            if emb is None:
                # No embedding available for this paragraph; skip it.
                continue
            chunks.append({"text": text, "embedding": emb})
        print(f"[RAG] built {len(chunks)} chunks from uploaded file")
        return chunks
    except Exception as e:
        # Deliberately broad: a failed upload yields no chunks rather than
        # an exception propagating to the caller.
        print(f"[RAG] error while building chunks: {repr(e)}")
        return []
def retrieve_relevant_chunks(
    question: str,
    rag_chunks: List[Dict],
    top_k: int = 3,
) -> str:
    """
    Retrieve the chunks most similar to ``question`` and join them into text.

    The question is embedded with ``get_embedding`` and compared against each
    chunk's stored embedding with ``cosine_similarity``. The ``top_k``
    highest-scoring chunk texts are returned in descending score order.

    Args:
        question: The current user question.
        rag_chunks: Dicts with ``"text"`` and ``"embedding"`` keys. Entries
            missing either value, or with an empty one, are ignored.
        top_k: Maximum number of chunks to return. Zero or a negative value
            yields an empty result; ``None`` means no limit.

    Returns:
        The selected chunk texts joined by ``"\\n---\\n"`` for use in a
        prompt, or ``""`` when there is nothing to return (no usable chunks,
        non-positive ``top_k``, or no embedding for the question).
    """
    # A negative top_k would otherwise slice as "all but the last |k|".
    if not rag_chunks or (top_k is not None and top_k <= 0):
        return ""

    # Filter before embedding the question so that no embedding call is
    # made when there is nothing to compare it against.
    usable = []
    for item in rag_chunks:
        emb = item.get("embedding")
        text = item.get("text", "")
        # Explicit None/length test instead of ``not emb``: truth-testing an
        # array-like embedding with several elements can raise.
        if emb is None or len(emb) == 0 or not text:
            continue
        usable.append((emb, text))
    if not usable:
        return ""

    q_emb = get_embedding(question)
    if q_emb is None:
        return ""

    scored = [(cosine_similarity(q_emb, emb), text) for emb, text in usable]
    # Stable sort on the score only, so ties keep their original order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    top_chunks = [text for _sim, text in scored[:top_k]]
    # A separator line helps the model tell the fragments apart.
    return "\n---\n".join(top_chunks)