AI_Agent_Final

Sleeping

App Files Files Community

SarahXia0405 committed on Dec 20, 2025

Commit

82b3136

verified ·

1 Parent(s): d4f2575

Update api/rag_engine.py

Browse files

Files changed (1) hide show

api/rag_engine.py +40 -37

api/rag_engine.py CHANGED Viewed

@@ -9,8 +9,7 @@ Chunk format (MVP):
   "text": str,
   "source_file": str,
   "section": str,
-  "doc_type": str,
-  "_tokens": frozenset[str]   # ✅ precomputed for fast retrieval (in-memory)
 }
 """
@@ -22,11 +21,10 @@ from pypdf import PdfReader
 from docx import Document
 from pptx import Presentation
-# precompiled regex for speed
-_WORD_RE = re.compile(r"[a-zA-Z0-9]+")
-_WS_RE = re.compile(r"\s+")
 def _clean_text(s: str) -> str:
     s = (s or "").replace("\r", "\n")
     s = re.sub(r"\n{3,}", "\n\n", s)
@@ -35,9 +33,9 @@ def _clean_text(s: str) -> str:
 def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
     """
-    Deterministic chunker:
     - split by blank lines
-    - pack into <= max_chars
     """
     text = _clean_text(text)
     if not text:
@@ -68,18 +66,14 @@ def _file_label(path: str) -> str:
     return os.path.basename(path) if path else "uploaded_file"
-def _tokenize(s: str) -> frozenset:
-    # normalize whitespace first to reduce regex work slightly
-    s = _WS_RE.sub(" ", (s or "").lower()).strip()
-    if not s:
-        return frozenset()
-    return frozenset(_WORD_RE.findall(s))
 # ----------------------------
 # Parsers
 # ----------------------------
 def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
     reader = PdfReader(path)
     out: List[Tuple[str, str]] = []
     for i, page in enumerate(reader.pages):
@@ -149,15 +143,12 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
     chunks: List[Dict] = []
     for section, text in sections:
         for j, piece in enumerate(_split_into_chunks(text), start=1):
-            # ✅ precompute tokens once
-            toks = _tokenize(piece)
             chunks.append(
                 {
                     "text": piece,
                     "source_file": source_file,
                     "section": f"{section}#{j}",
                     "doc_type": doc_type,
-                    "_tokens": toks,
                 }
             )
@@ -167,30 +158,30 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
 def retrieve_relevant_chunks(
     query: str,
     chunks: List[Dict],
-    k: int = 3,                 # ✅ smaller default = faster + less prompt
-    max_context_chars: int = 2200,  # ✅ smaller default = faster
 ) -> Tuple[str, List[Dict]]:
     """
-    Fast deterministic retrieval:
-    - score by token overlap using precomputed chunk tokens
-    - return top-k chunks concatenated as context
     """
     query = _clean_text(query)
     if not query or not chunks:
         return "", []
-    q_tokens = _tokenize(query)
     if not q_tokens:
         return "", []
     scored: List[Tuple[int, Dict]] = []
     for c in chunks:
-        t_tokens = c.get("_tokens")
-        if not t_tokens:
-            # fallback if older chunks exist without tokens
-            t_tokens = _tokenize(c.get("text") or "")
-            c["_tokens"] = t_tokens
         score = len(q_tokens.intersection(t_tokens))
         if score > 0:
             scored.append((score, c))
@@ -199,6 +190,12 @@ def retrieve_relevant_chunks(
         return "", []
     scored.sort(key=lambda x: x[0], reverse=True)
     top = [c for _, c in scored[:k]]
     buf_parts: List[str] = []
@@ -208,12 +205,18 @@ def retrieve_relevant_chunks(
         t = c.get("text") or ""
         if not t:
             continue
-        if total + len(t) > max_context_chars:
-            t = t[: max(0, max_context_chars - total)]
-        if t:
-            buf_parts.append(t)
-            used.append(c)
-            total += len(t)
         if total >= max_context_chars:
             break

   "text": str,
   "source_file": str,
   "section": str,
+  "doc_type": str
 }
 """
 from docx import Document
 from pptx import Presentation
+# ----------------------------
+# Helpers
+# ----------------------------
 def _clean_text(s: str) -> str:
     s = (s or "").replace("\r", "\n")
     s = re.sub(r"\n{3,}", "\n\n", s)
 def _split_into_chunks(text: str, max_chars: int = 1400) -> List[str]:
     """
+    Simple deterministic chunker:
     - split by blank lines
+    - then pack into <= max_chars
     """
     text = _clean_text(text)
     if not text:
     return os.path.basename(path) if path else "uploaded_file"
 # ----------------------------
 # Parsers
 # ----------------------------
 def _parse_pdf_to_text(path: str) -> List[Tuple[str, str]]:
+    """
+    Returns list of (section_label, text)
+    section_label uses page numbers.
+    """
     reader = PdfReader(path)
     out: List[Tuple[str, str]] = []
     for i, page in enumerate(reader.pages):
     chunks: List[Dict] = []
     for section, text in sections:
         for j, piece in enumerate(_split_into_chunks(text), start=1):
             chunks.append(
                 {
                     "text": piece,
                     "source_file": source_file,
                     "section": f"{section}#{j}",
                     "doc_type": doc_type,
                 }
             )
 def retrieve_relevant_chunks(
     query: str,
     chunks: List[Dict],
+    k: int = 2,
+    max_context_chars: int = 1200,
+    min_score: int = 3,
 ) -> Tuple[str, List[Dict]]:
     """
+    Deterministic lightweight retrieval (no embeddings):
+    - score by token overlap (fast)
+    - ONLY include context when overlap score is meaningful (>= min_score)
+    - keep context short to reduce LLM latency
     """
     query = _clean_text(query)
     if not query or not chunks:
         return "", []
+    q_tokens = set(re.findall(r"[a-zA-Z0-9]+", query.lower()))
     if not q_tokens:
         return "", []
     scored: List[Tuple[int, Dict]] = []
     for c in chunks:
+        text = (c.get("text") or "")
+        if not text:
+            continue
+        t_tokens = set(re.findall(r"[a-zA-Z0-9]+", text.lower()))
         score = len(q_tokens.intersection(t_tokens))
         if score > 0:
             scored.append((score, c))
         return "", []
     scored.sort(key=lambda x: x[0], reverse=True)
+    # If even the best match is weak, skip the RAG context (avoids slowing down for nothing)
+    best_score = scored[0][0]
+    if best_score < min_score:
+        return "", []
     top = [c for _, c in scored[:k]]
     buf_parts: List[str] = []
         t = c.get("text") or ""
         if not t:
             continue
+        remaining = max_context_chars - total
+        if remaining <= 0:
+            break
+        if len(t) > remaining:
+            t = t[:remaining]
+        buf_parts.append(t)
+        used.append(c)
+        total += len(t)
         if total >= max_context_chars:
             break