Update api/rag_engine.py
api/rag_engine.py  +46 -6  CHANGED
@@ -2,7 +2,7 @@
 """
 RAG engine:
 - build_rag_chunks_from_file(path, doc_type) -> List[chunk]
-- retrieve_relevant_chunks(query, chunks) -> (context_text, used_chunks)
+- retrieve_relevant_chunks(query, chunks, ...) -> (context_text, used_chunks)
 
 Chunk format (MVP):
 {
@@ -11,11 +11,17 @@ Chunk format (MVP):
     "section": str,
     "doc_type": str
 }
+
+✅ Update in this version:
+- retrieve_relevant_chunks now supports optional scoping:
+  - allowed_source_files: Optional[List[str]] (match by basename)
+  - allowed_doc_types: Optional[List[str]]
+- Scoping happens BEFORE scoring, so refs returned are guaranteed to be the true used chunks.
 """
 
 import os
 import re
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Optional
 
 from pypdf import PdfReader
 from docx import Document
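Note: the chunk keys visible in this diff ("text", "source_file", "section", "doc_type") suggest a shape roughly like the following; the values here are purely illustrative, not taken from the repo:

    chunk = {
        "text": "Either party may terminate with 30 days written notice...",
        "source_file": "contract.pdf",   # basename is what scoping matches on
        "section": "Termination",
        "doc_type": "contract",
    }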
@@ -61,14 +67,12 @@ def _truncate_to_tokens(text: str, max_tokens: int, model: str = "") -> str:
 
     tk = _safe_import_tiktoken()
     if tk is None:
-        # approximate by chars
         total = _approx_tokens(text)
         if total <= max_tokens:
             return text
         ratio = max_tokens / max(1, total)
         cut = max(50, min(len(text), int(len(text) * ratio)))
         s = text[:cut]
-        # tighten
         while _approx_tokens(s) > max_tokens and len(s) > 50:
             s = s[: int(len(s) * 0.9)]
         return s
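The fallback branch above leans on _approx_tokens, which is outside this diff. A minimal sketch of such a character-based approximation, assuming roughly four characters per token (an assumption, not necessarily the repo's implementation):

    def _approx_tokens(text: str) -> int:
        # Assumed heuristic: ~4 characters per token on average.
        return max(1, len(text) // 4)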
@@ -136,6 +140,13 @@ def _file_label(path: str) -> str:
     return os.path.basename(path) if path else "uploaded_file"
 
 
+def _basename(x: str) -> str:
+    try:
+        return os.path.basename(x or "")
+    except Exception:
+        return x or ""
+
+
 # ----------------------------
 # Parsers
 # ----------------------------
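The new _basename helper is a defensive wrapper around os.path.basename; for example (illustrative inputs):

    _basename("uploads/2024/contract.pdf")  # -> "contract.pdf"
    _basename(None)                         # -> "" (the `x or ""` guard catches falsy input)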
@@ -234,12 +245,20 @@ def retrieve_relevant_chunks(
     chunk_token_limit: int = RAG_CHUNK_TOKEN_LIMIT,
     max_context_tokens: int = RAG_CONTEXT_TOKEN_LIMIT,
     model_for_tokenizer: str = "",
+    # ✅ NEW: scoping controls
+    allowed_source_files: Optional[List[str]] = None,
+    allowed_doc_types: Optional[List[str]] = None,
 ) -> Tuple[str, List[Dict]]:
     """
     Deterministic lightweight retrieval (no embeddings):
     - score by token overlap
     - return top-k chunks concatenated as context
 
+    ✅ Scoping:
+    - If allowed_source_files provided: only consider chunks whose source_file basename is in the allowlist
+    - If allowed_doc_types provided: only consider chunks whose doc_type is in the allowlist
+    Scoping is applied BEFORE scoring; returned used_chunks are the true sources for refs.
+
     Hard limits implemented:
     - top-k <= 4 (default)
     - each chunk <= 500 tokens
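The overlap scoring itself is not touched by this commit; a minimal sketch consistent with the "score by token overlap" description in the docstring, with the helper name being hypothetical:

    def _overlap_score(query: str, chunk_text: str) -> int:
        # Count distinct query tokens that also appear in the chunk text.
        q = set(re.findall(r"[a-zA-Z0-9]+", query.lower()))
        t = set(re.findall(r"[a-zA-Z0-9]+", chunk_text.lower()))
        return len(q & t)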
@@ -249,6 +268,28 @@ def retrieve_relevant_chunks(
     if not query or not chunks:
         return "", []
 
+    # ----------------------------
+    # ✅ Apply scoping BEFORE scoring
+    # ----------------------------
+    filtered = chunks or []
+
+    if allowed_source_files:
+        allow_files = {_basename(str(x)).strip() for x in allowed_source_files if str(x).strip()}
+        if allow_files:
+            filtered = [
+                c
+                for c in filtered
+                if _basename(str(c.get("source_file", ""))).strip() in allow_files
+            ]
+
+    if allowed_doc_types:
+        allow_dt = {str(x).strip() for x in allowed_doc_types if str(x).strip()}
+        if allow_dt:
+            filtered = [c for c in filtered if str(c.get("doc_type", "")).strip() in allow_dt]
+
+    if not filtered:
+        return "", []
+
     # ✅ Short query gate: avoid wasting time on RAG for greetings / tiny inputs
     q_tokens_list = re.findall(r"[a-zA-Z0-9]+", query.lower())
     if (len(q_tokens_list) < 3) and (len(query) < 20):
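With the new parameters, a scoped call would look roughly like this (all_chunks, the file name, and the doc type are illustrative):

    context_text, used_chunks = retrieve_relevant_chunks(
        query="What is the termination notice period?",
        chunks=all_chunks,
        allowed_source_files=["contract.pdf"],  # matched by basename, per _basename
        allowed_doc_types=["contract"],
    )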
@@ -259,7 +300,7 @@
         return "", []
 
     scored: List[Tuple[int, Dict]] = []
-    for c in chunks:
+    for c in filtered:
         text = (c.get("text") or "")
         if not text:
             continue
@@ -300,7 +341,6 @@
 
     # legacy char cap safety (keep your previous behavior as extra guard)
     if max_context_chars and max_context_chars > 0:
-        # approximate: don't let total string blow up
         current_chars = sum(len(x) for x in truncated_texts)
         if current_chars + len(t) > max_context_chars:
             t = t[: max(0, max_context_chars - current_chars)]
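As a concrete instance of the char cap guard above: with max_context_chars = 12000 and 11900 characters already accumulated, a further 300-character chunk t is cut to t[:100].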