AI_Agent_Final_V2

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 20, 2025

Commit

6e941de

verified ·

1 Parent(s): 682585f

Update api/rag_engine.py

Browse files

Files changed (1) hide show

api/rag_engine.py +35 -23

api/rag_engine.py CHANGED Viewed

@@ -8,7 +8,9 @@ Chunk format (MVP):
 {
   "text": str,
   "source_file": str,
-  "section": str
 }
 """
@@ -20,14 +22,11 @@ from pypdf import PdfReader
 from docx import Document
 from pptx import Presentation
-# IMPORTANT: now under api/
-from api.syllabus_utils import parse_pptx_slides  # optional reuse
-from api.config import DEFAULT_COURSE_TOPICS
-# ----------------------------
-# Helpers
-# ----------------------------
 def _clean_text(s: str) -> str:
     s = (s or "").replace("\r", "\n")
     s = re.sub(r"\n{3,}", "\n\n", s)
@@ -36,9 +35,9 @@ def _clean_text(s: str) -> str:
 def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
     """
-    Simple deterministic chunker:
     - split by blank lines
-    - then pack into <= max_chars
     """
     text = _clean_text(text)
     if not text:
@@ -69,14 +68,18 @@ def _file_label(path: str) -> str:
     return os.path.basename(path) if path else "uploaded_file"
 # ----------------------------
 # Parsers
 # ----------------------------
 def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
-    """
-    Returns list of (section_label, text)
-    section_label uses page numbers.
-    """
     reader = PdfReader(path)
     out: List[Tuple[str, str]] = []
     for i, page in enumerate(reader.pages):
@@ -125,7 +128,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
     ext = os.path.splitext(path)[1].lower()
     source_file = _file_label(path)
-    # Parse into (section, text blocks)
     sections: List[Tuple[str, str]] = []
     try:
         if ext == ".pdf":
@@ -138,7 +140,6 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
             with open(path, "r", encoding="utf-8", errors="ignore") as f:
                 sections = [("text", _clean_text(f.read()))]
         else:
-            # Unsupported file type: return empty (safe)
             print(f"[rag_engine] unsupported file type: {ext}")
             return []
     except Exception as e:
@@ -147,14 +148,16 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
     chunks: List[Dict] = []
     for section, text in sections:
-        # Split section text into smaller chunks
         for j, piece in enumerate(_split_into_chunks(text), start=1):
             chunks.append(
                 {
                     "text": piece,
                     "source_file": source_file,
                     "section": f"{section}#{j}",
                     "doc_type": doc_type,
                 }
             )
@@ -162,33 +165,42 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
 def retrieve_relevant_chunks(
-    query: str, chunks: List[Dict], k: int = 4, max_context_chars: int = 2800
 ) -> Tuple[str, List[Dict]]:
     """
-    Deterministic lightweight retrieval (no embeddings):
-    - score by token overlap (very fast)
     - return top-k chunks concatenated as context
     """
     query = _clean_text(query)
     if not query or not chunks:
         return "", []
-    q_tokens = set(re.findall(r"[a-zA-Z0-9]+", query.lower()))
     if not q_tokens:
         return "", []
     scored: List[Tuple[int, Dict]] = []
     for c in chunks:
-        text = (c.get("text") or "")
-        t_tokens = set(re.findall(r"[a-zA-Z0-9]+", text.lower()))
         score = len(q_tokens.intersection(t_tokens))
         if score > 0:
             scored.append((score, c))
     scored.sort(key=lambda x: x[0], reverse=True)
     top = [c for _, c in scored[:k]]
-    # Build context text
     buf_parts: List[str] = []
     used: List[Dict] = []
     total = 0

 {
   "text": str,
   "source_file": str,
+  "section": str,
+  "doc_type": str,
+  "_tokens": frozenset[str]   # ✅ precomputed for fast retrieval (in-memory)
 }
 """
 from docx import Document
 from pptx import Presentation
+# precompiled regex for speed
+_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
+_WS_RE = re.compile(r"\s+")
 def _clean_text(s: str) -> str:
     s = (s or "").replace("\r", "\n")
     s = re.sub(r"\n{3,}", "\n\n", s)
 def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
     """
+    Deterministic chunker:
     - split by blank lines
+    - pack into <= max_chars
     """
     text = _clean_text(text)
     if not text:
     return os.path.basename(path) if path else "uploaded_file"
+def _tokenize(s: str) -> frozenset:
+    # normalize whitespace first to reduce regex work slightly
+    s = _WS_RE.sub(" ", (s or "").lower()).strip()
+    if not s:
+        return frozenset()
+    return frozenset(_WORD_RE.findall(s))
 # ----------------------------
 # Parsers
 # ----------------------------
 def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
     reader = PdfReader(path)
     out: List[Tuple[str, str]] = []
     for i, page in enumerate(reader.pages):
     ext = os.path.splitext(path)[1].lower()
     source_file = _file_label(path)
     sections: List[Tuple[str, str]] = []
     try:
         if ext == ".pdf":
             with open(path, "r", encoding="utf-8", errors="ignore") as f:
                 sections = [("text", _clean_text(f.read()))]
         else:
             print(f"[rag_engine] unsupported file type: {ext}")
             return []
     except Exception as e:
     chunks: List[Dict] = []
     for section, text in sections:
         for j, piece in enumerate(_split_into_chunks(text), start=1):
+            # ✅ precompute tokens once
+            toks = _tokenize(piece)
             chunks.append(
                 {
                     "text": piece,
                     "source_file": source_file,
                     "section": f"{section}#{j}",
                     "doc_type": doc_type,
+                    "_tokens": toks,
                 }
             )
 def retrieve_relevant_chunks(
+    query: str,
+    chunks: List[Dict],
+    k: int = 3,                 # ✅ smaller default = faster + less prompt
+    max_context_chars: int = 2200,  # ✅ smaller default = faster
 ) -> Tuple[str, List[Dict]]:
     """
+    Fast deterministic retrieval:
+    - score by token overlap using precomputed chunk tokens
     - return top-k chunks concatenated as context
     """
     query = _clean_text(query)
     if not query or not chunks:
         return "", []
+    q_tokens = _tokenize(query)
     if not q_tokens:
         return "", []
     scored: List[Tuple[int, Dict]] = []
     for c in chunks:
+        t_tokens = c.get("_tokens")
+        if not t_tokens:
+            # fallback if older chunks exist without tokens
+            t_tokens = _tokenize(c.get("text") or "")
+            c["_tokens"] = t_tokens
         score = len(q_tokens.intersection(t_tokens))
         if score > 0:
             scored.append((score, c))
+    if not scored:
+        return "", []
     scored.sort(key=lambda x: x[0], reverse=True)
     top = [c for _, c in scored[:k]]
     buf_parts: List[str] = []
     used: List[Dict] = []
     total = 0