SarahXia0405 committed on
Commit
3268902
·
verified ·
1 Parent(s): 037dc25

Update api/rag_engine.py

Browse files
Files changed (1) hide show
  1. api/rag_engine.py +121 -17
api/rag_engine.py CHANGED
@@ -21,6 +21,77 @@ from pypdf import PdfReader
21
  from docx import Document
22
  from pptx import Presentation
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # ----------------------------
25
  # Helpers
26
  # ----------------------------
@@ -157,19 +228,22 @@ def build_rag_chunks_from_file(path: str, doc_type: str) -> List[Dict]:
157
  def retrieve_relevant_chunks(
158
  query: str,
159
  chunks: List[Dict],
160
- k: int = 1,
161
- max_context_chars: int = 600,
162
  min_score: int = 6,
 
 
 
163
  ) -> Tuple[str, List[Dict]]:
164
  """
165
  Deterministic lightweight retrieval (no embeddings):
166
  - score by token overlap
167
  - return top-k chunks concatenated as context
168
 
169
- Speed improvements:
170
- - short/generic queries won't trigger RAG
171
- - higher min_score prevents accidental triggers
172
- - smaller max_context_chars reduces LLM prompt size
173
  """
174
  query = _clean_text(query)
175
  if not query or not chunks:
@@ -198,22 +272,52 @@ def retrieve_relevant_chunks(
198
  return "", []
199
 
200
  scored.sort(key=lambda x: x[0], reverse=True)
 
 
 
201
  top = [c for _, c in scored[:k]]
202
 
203
- buf_parts: List[str] = []
204
  used: List[Dict] = []
205
- total = 0
 
 
206
  for c in top:
207
- t = c.get("text") or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  if not t:
209
  continue
210
- if total + len(t) > max_context_chars:
211
- t = t[: max(0, max_context_chars - total)]
212
- if t:
213
- buf_parts.append(t)
214
- used.append(c)
215
- total += len(t)
216
- if total >= max_context_chars:
217
  break
218
 
219
- return "\n\n---\n\n".join(buf_parts), used
 
 
 
 
 
21
  from docx import Document
22
  from pptx import Presentation
23
 
24
+
25
+ # ============================
26
+ # Token helpers (optional tiktoken)
27
+ # ============================
28
+ def _safe_import_tiktoken():
29
+ try:
30
+ import tiktoken # type: ignore
31
+ return tiktoken
32
+ except Exception:
33
+ return None
34
+
35
+
36
+ def _approx_tokens(text: str) -> int:
37
+ if not text:
38
+ return 0
39
+ return max(1, int(len(text) / 4))
40
+
41
+
42
+ def _count_text_tokens(text: str, model: str = "") -> int:
43
+ tk = _safe_import_tiktoken()
44
+ if tk is None:
45
+ return _approx_tokens(text)
46
+
47
+ try:
48
+ enc = tk.encoding_for_model(model) if model else tk.get_encoding("cl100k_base")
49
+ except Exception:
50
+ enc = tk.get_encoding("cl100k_base")
51
+
52
+ return len(enc.encode(text or ""))
53
+
54
+
55
+ def _truncate_to_tokens(text: str, max_tokens: int, model: str = "") -> str:
56
+ """
57
+ Deterministic truncation. Uses tiktoken if available; otherwise approximates by char ratio.
58
+ """
59
+ if not text:
60
+ return text
61
+
62
+ tk = _safe_import_tiktoken()
63
+ if tk is None:
64
+ # approximate by chars
65
+ total = _approx_tokens(text)
66
+ if total <= max_tokens:
67
+ return text
68
+ ratio = max_tokens / max(1, total)
69
+ cut = max(50, min(len(text), int(len(text) * ratio)))
70
+ s = text[:cut]
71
+ # tighten
72
+ while _approx_tokens(s) > max_tokens and len(s) > 50:
73
+ s = s[: int(len(s) * 0.9)]
74
+ return s
75
+
76
+ try:
77
+ enc = tk.encoding_for_model(model) if model else tk.get_encoding("cl100k_base")
78
+ except Exception:
79
+ enc = tk.get_encoding("cl100k_base")
80
+
81
+ ids = enc.encode(text or "")
82
+ if len(ids) <= max_tokens:
83
+ return text
84
+ return enc.decode(ids[:max_tokens])
85
+
86
+
87
# ============================
# RAG hard limits
# ============================
# Maximum number of chunks ever returned by retrieval, regardless of caller k.
RAG_TOPK_LIMIT = 4
# Per-chunk token cap applied before chunks are joined into the context.
RAG_CHUNK_TOKEN_LIMIT = 500
# Total token cap for the concatenated context handed to the LLM.
RAG_CONTEXT_TOKEN_LIMIT = 2000  # 4 * 500
93
+
94
+
95
  # ----------------------------
96
  # Helpers
97
  # ----------------------------
 
228
  def retrieve_relevant_chunks(
229
  query: str,
230
  chunks: List[Dict],
231
+ k: int = RAG_TOPK_LIMIT,
232
+ max_context_chars: int = 600, # kept for backward compatibility (still used as a safety cap)
233
  min_score: int = 6,
234
+ chunk_token_limit: int = RAG_CHUNK_TOKEN_LIMIT,
235
+ max_context_tokens: int = RAG_CONTEXT_TOKEN_LIMIT,
236
+ model_for_tokenizer: str = "",
237
  ) -> Tuple[str, List[Dict]]:
238
  """
239
  Deterministic lightweight retrieval (no embeddings):
240
  - score by token overlap
241
  - return top-k chunks concatenated as context
242
 
243
+ Hard limits implemented:
244
+ - top-k <= 4 (default)
245
+ - each chunk <= 500 tokens
246
+ - total context <= 2000 tokens (default)
247
  """
248
  query = _clean_text(query)
249
  if not query or not chunks:
 
272
  return "", []
273
 
274
  scored.sort(key=lambda x: x[0], reverse=True)
275
+
276
+ # hard cap k
277
+ k = min(int(k or RAG_TOPK_LIMIT), RAG_TOPK_LIMIT)
278
  top = [c for _, c in scored[:k]]
279
 
280
+ # truncate each chunk to <= chunk_token_limit
281
  used: List[Dict] = []
282
+ truncated_texts: List[str] = []
283
+ total_tokens = 0
284
+
285
  for c in top:
286
+ raw = c.get("text") or ""
287
+ if not raw:
288
+ continue
289
+
290
+ t = _truncate_to_tokens(raw, max_tokens=chunk_token_limit, model=model_for_tokenizer)
291
+
292
+ # enforce total context tokens cap
293
+ t_tokens = _count_text_tokens(t, model=model_for_tokenizer)
294
+ if total_tokens + t_tokens > max_context_tokens:
295
+ remaining = max_context_tokens - total_tokens
296
+ if remaining <= 0:
297
+ break
298
+ t = _truncate_to_tokens(t, max_tokens=remaining, model=model_for_tokenizer)
299
+ t_tokens = _count_text_tokens(t, model=model_for_tokenizer)
300
+
301
+ # legacy char cap safety (keep your previous behavior as extra guard)
302
+ if max_context_chars and max_context_chars > 0:
303
+ # approximate: don't let total string blow up
304
+ current_chars = sum(len(x) for x in truncated_texts)
305
+ if current_chars + len(t) > max_context_chars:
306
+ t = t[: max(0, max_context_chars - current_chars)]
307
+
308
+ t = _clean_text(t)
309
  if not t:
310
  continue
311
+
312
+ truncated_texts.append(t)
313
+ used.append(c)
314
+ total_tokens += t_tokens
315
+
316
+ if total_tokens >= max_context_tokens:
 
317
  break
318
 
319
+ if not truncated_texts:
320
+ return "", []
321
+
322
+ context = "\n\n---\n\n".join(truncated_texts)
323
+ return context, used