SarahXia0405 commited on
Commit
a2a2d14
·
verified ·
1 Parent(s): 37cc1a4

Update api/rag_engine.py

Browse files
Files changed (1) hide show
  1. api/rag_engine.py +179 -119
api/rag_engine.py CHANGED
@@ -1,148 +1,208 @@
1
- # rag_engine.py
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
- from typing import List, Dict, Tuple
4
-
5
- from syllabus_utils import (
6
- parse_syllabus_docx,
7
- parse_syllabus_pdf,
8
- parse_pptx_slides,
9
- )
10
- from clare_core import (
11
- get_embedding,
12
- cosine_similarity,
13
- )
14
- from langsmith import traceable
15
- from langsmith.run_helpers import set_run_metadata
16
-
17
-
18
- def build_rag_chunks_from_file(file, doc_type_val: str) -> List[Dict]:
 
 
 
 
 
 
19
  """
20
- 从文件构建 RAG chunk 列表(session 级别)。
21
-
22
- 支持两种输入形式:
23
- - file 是上传文件对象(带 .name)
24
- - file 是字符串路径(用于预加载 Module10)
25
-
26
- 每个 chunk 结构:
27
- {
28
- "text": str,
29
- "embedding": List[float],
30
- "source_file": "module10_responsible_ai.pdf",
31
- "section": "Literature Review / Paper – chunk 3"
32
- }
33
  """
34
- # 1) 统一拿到文件路径
35
- if isinstance(file, str):
36
- file_path = file
37
- else:
38
- file_path = getattr(file, "name", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- if not file_path:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return []
42
 
43
- ext = os.path.splitext(file_path)[1].lower()
44
- basename = os.path.basename(file_path)
45
 
 
 
46
  try:
47
- # 2) 解析文件 → 文本块列表
48
- if ext == ".docx":
49
- texts = parse_syllabus_docx(file_path)
50
- elif ext == ".pdf":
51
- texts = parse_syllabus_pdf(file_path)
52
  elif ext == ".pptx":
53
- texts = parse_pptx_slides(file_path)
 
 
 
54
  else:
55
- print(f"[RAG] unsupported file type for RAG: {ext}")
 
56
  return []
 
 
 
57
 
58
- # 3) 对每个文本块做 embedding,并附上 metadata
59
- chunks: List[Dict] = []
60
- for idx, t in enumerate(texts):
61
- text = (t or "").strip()
62
- if not text:
63
- continue
64
- emb = get_embedding(text)
65
- if emb is None:
66
- continue
67
-
68
- section_label = f"{doc_type_val} – chunk {idx + 1}"
69
  chunks.append(
70
  {
71
- "text": text,
72
- "embedding": emb,
73
- "source_file": basename,
74
- "section": section_label,
75
  }
76
  )
77
 
78
- print(
79
- f"[RAG] built {len(chunks)} chunks from file ({ext}, doc_type={doc_type_val}, path={basename})"
80
- )
81
- return chunks
82
-
83
- except Exception as e:
84
- print(f"[RAG] error while building chunks: {repr(e)}")
85
- return []
86
 
87
 
88
- @traceable(run_type="retriever", name="retrieve_relevant_chunks")
89
  def retrieve_relevant_chunks(
90
- question: str,
91
- rag_chunks: List[Dict],
92
- top_k: int = 3,
93
  ) -> Tuple[str, List[Dict]]:
94
  """
95
- embedding 对当前问题做检索,从 rag_chunks 中找出最相关的 top_k 段落。
96
-
97
- 返回:
98
- - context_text: 拼接后的文本(给 LLM 用)
99
- - used_chunks: 本轮实际用到的 chunk 列表(给 reference 用)
100
  """
101
- if not rag_chunks:
 
102
  return "", []
103
 
104
- q_emb = get_embedding(question)
105
- if q_emb is None:
106
  return "", []
107
 
108
- scored = []
109
- for item in rag_chunks:
110
- emb = item.get("embedding")
111
- text = item.get("text", "")
112
- if not emb or not text:
113
- continue
114
- sim = cosine_similarity(q_emb, emb)
115
- scored.append((sim, item))
116
-
117
- if not scored:
118
- return "", []
119
 
120
  scored.sort(key=lambda x: x[0], reverse=True)
121
- top_items = scored[:top_k]
122
-
123
- # LLM 使用的拼接上下文
124
- top_texts = [it["text"] for _sim, it in top_items]
125
- context_text = "\n---\n".join(top_texts)
126
-
127
- # reference & logging 使用的详细 chunk
128
- used_chunks = [it for _sim, it in top_items]
129
-
130
- # LangSmith metadata(可选)
131
- try:
132
- previews = [
133
- {
134
- "score": float(sim),
135
- "text_preview": it["text"][:200],
136
- "source_file": it.get("source_file"),
137
- "section": it.get("section"),
138
- }
139
- for sim, it in top_items
140
- ]
141
- set_run_metadata(
142
- question=question,
143
- retrieved_chunks=previews,
144
- )
145
- except Exception as e:
146
- print(f"[LangSmith metadata error in retrieve_relevant_chunks] {repr(e)}")
147
-
148
- return context_text, used_chunks
 
1
+ # api/rag_engine.py
2
+ """
3
+ RAG engine:
4
+ - build_rag_chunks_from_file(path, doc_type) -> List[chunk]
5
+ - retrieve_relevant_chunks(query, chunks) -> (context_text, used_chunks)
6
+
7
+ Chunk format (MVP):
8
+ {
9
+ "text": str,
10
+ "source_file": str,
11
+ "section": str
12
+ }
13
+ """
14
+
15
  import os
16
+ import re
17
+ from typing import Dict, List, Tuple
18
+
19
+ from pypdf import PdfReader
20
+ from docx import Document
21
+ from pptx import Presentation
22
+
23
+ # IMPORTANT: now under api/
24
+ from api.syllabus_utils import parse_pptx_slides # optional reuse
25
+ from api.config import DEFAULT_COURSE_TOPICS
26
+
27
+
28
+ # ----------------------------
29
+ # Helpers
30
+ # ----------------------------
31
+ def _clean_text(s: str) -> str:
32
+ s = (s or "").replace("\r", "\n")
33
+ s = re.sub(r"\n{3,}", "\n\n", s)
34
+ return s.strip()
35
+
36
+
37
def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
    """
    Simple deterministic chunker:
    - split by blank lines
    - then pack paragraphs into chunks of <= max_chars

    Fix: a single paragraph longer than ``max_chars`` is now
    hard-split at the character budget, so every returned chunk
    honors the documented size bound (previously one oversized
    paragraph produced an arbitrarily large chunk).
    """
    text = _clean_text(text)
    if not text:
        return []

    # Paragraph list, with oversize paragraphs pre-split to the budget.
    paras: List[str] = []
    for p in text.split("\n\n"):
        p = p.strip()
        if not p:
            continue
        while len(p) > max_chars:
            paras.append(p[:max_chars])
            p = p[max_chars:].strip()
        if p:
            paras.append(p)

    # Greedy packing: join consecutive paragraphs while they fit
    # (the +2 accounts for the "\n\n" separator).
    chunks: List[str] = []
    buf = ""
    for p in paras:
        if not buf:
            buf = p
        elif len(buf) + 2 + len(p) <= max_chars:
            buf = buf + "\n\n" + p
        else:
            chunks.append(buf)
            buf = p

    if buf:
        chunks.append(buf)

    return chunks
66
 
67
+
68
+ def _file_label(path: str) -> str:
69
+ return os.path.basename(path) if path else "uploaded_file"
70
+
71
+
72
+ # ----------------------------
73
+ # Parsers
74
+ # ----------------------------
75
def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
    """
    Extract text page by page from a PDF.

    Returns (section_label, text) pairs where the label is the 1-based
    page number ("p1", "p2", ...). Pages with no extractable text are
    skipped.
    """
    results: List[Tuple[str, str]] = []
    for page_no, page in enumerate(PdfReader(path).pages, start=1):
        cleaned = _clean_text(page.extract_text() or "")
        if cleaned:
            results.append((f"p{page_no}", cleaned))
    return results
88
+
89
+
90
def _parse_docx_to_text(path: str) -> List[Tuple[str, str]]:
    """
    Extract all non-empty paragraphs from a .docx file.

    Returns a single ("docx", full_text) pair, or an empty list when
    the document contains no text.
    """
    stripped = (p.text.strip() for p in Document(path).paragraphs if p.text)
    paras = [t for t in stripped if t]
    if not paras:
        return []
    return [("docx", _clean_text("\n\n".join(paras)))]
97
+
98
+
99
def _parse_pptx_to_text(path: str) -> List[Tuple[str, str]]:
    """
    Extract visible text from each slide of a .pptx deck.

    Returns (section_label, text) pairs labeled "slide1", "slide2", ...
    Slides whose shapes carry no text are skipped.
    """
    results: List[Tuple[str, str]] = []
    for slide_no, slide in enumerate(Presentation(path).slides, start=1):
        texts = [
            shape.text.strip()
            for shape in slide.shapes
            # Not every shape exposes .text (pictures etc.) — guard first.
            if hasattr(shape, "text") and shape.text and shape.text.strip()
        ]
        if texts:
            results.append((f"slide{slide_no}", _clean_text("\n".join(texts))))
    return results
112
+
113
+
114
+ # ----------------------------
115
+ # Public API
116
+ # ----------------------------
117
def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
    """
    Build RAG chunks from a local file path.

    Supports: .pdf / .docx / .pptx / .txt / .md
    (the docstring previously omitted .md even though the code
    accepted it).

    Each chunk dict carries:
        text        - chunk body (size-bounded)
        source_file - basename of the source file
        section     - "<section>#<n>" (page/slide/text label + 1-based index)
        doc_type    - caller-supplied document type tag

    Returns [] for a missing path, an unsupported extension, or any
    parse failure; best-effort — parse errors are logged, never raised.
    """
    if not path or not os.path.exists(path):
        return []

    ext = os.path.splitext(path)[1].lower()
    source_file = _file_label(path)

    # Parse into (section_label, text) pairs
    sections: List[Tuple[str, str]] = []
    try:
        if ext == ".pdf":
            sections = _parse_pdf_to_text(path)
        elif ext == ".docx":
            sections = _parse_docx_to_text(path)
        elif ext == ".pptx":
            sections = _parse_pptx_to_text(path)
        elif ext in (".txt", ".md"):
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                sections = [("text", _clean_text(f.read()))]
        else:
            # Unsupported file type: return empty (safe)
            print(f"[rag_engine] unsupported file type: {ext}")
            return []
    except Exception as e:
        # A corrupt upload must not crash the app — log and bail out.
        print(f"[rag_engine] parse error for {source_file}: {repr(e)}")
        return []

    chunks: List[Dict] = []
    for section, text in sections:
        # Split section text into smaller chunks; chunk index is 1-based.
        for j, piece in enumerate(_split_into_chunks(text), start=1):
            chunks.append(
                {
                    "text": piece,
                    "source_file": source_file,
                    "section": f"{section}#{j}",
                    "doc_type": doc_type,
                }
            )

    return chunks
 
 
 
 
 
 
 
162
 
163
 
 
164
def retrieve_relevant_chunks(
    query: str, chunks: List[Dict], k: int = 4, max_context_chars: int = 2800
) -> Tuple[str, List[Dict]]:
    """
    Deterministic lightweight retrieval (no embeddings).

    Scores every chunk by how many query tokens (lowercased
    alphanumeric words) appear in its text, keeps the top-k positive
    scorers, and concatenates them into a context string capped at
    ``max_context_chars``.

    Returns (context_text, used_chunks); both empty when there is
    nothing to retrieve.
    """

    def _tok(s: str) -> set:
        return set(re.findall(r"[a-zA-Z0-9]+", s.lower()))

    cleaned_query = _clean_text(query)
    if not cleaned_query or not chunks:
        return "", []

    query_tokens = _tok(cleaned_query)
    if not query_tokens:
        return "", []

    # Rank chunks by token overlap; drop zero-overlap chunks entirely.
    ranked: List[Tuple[int, Dict]] = []
    for chunk in chunks:
        body = chunk.get("text") or ""
        overlap = len(query_tokens & _tok(body))
        if overlap > 0:
            ranked.append((overlap, chunk))

    ranked.sort(key=lambda pair: pair[0], reverse=True)

    # Assemble the context, truncating the last piece to the budget.
    parts: List[str] = []
    selected: List[Dict] = []
    budget = max_context_chars
    for _, chunk in ranked[:k]:
        body = chunk.get("text") or ""
        if not body:
            continue
        snippet = body[:budget] if len(body) > budget else body
        if snippet:
            parts.append(snippet)
            selected.append(chunk)
            budget -= len(snippet)
        if budget <= 0:
            break

    return "\n\n---\n\n".join(parts), selected