Spaces:
Sleeping
Sleeping
Update rag_retrieval.py
Browse files- rag_retrieval.py +86 -2
rag_retrieval.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
|
|
| 1 |
import json
|
| 2 |
import time
|
| 3 |
import uuid
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
|
| 6 |
from rag_config import RUNS_DIR, ROOT_DIR
|
| 7 |
from rag_data import chunks, bm25, embeddings, sem_model, THREAD_OPTIONS
|
|
@@ -9,11 +11,44 @@ from rag_sessions import get_session
|
|
| 9 |
|
| 10 |
RUNS_DIR.mkdir(exist_ok=True)
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def rewrite_query(user_text: str, session: dict) -> str:
|
| 14 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 15 |
tid = session["thread_id"]
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
def retrieve_chunks(rewrite: str, session: dict, search_outside_thread: bool):
|
|
@@ -61,6 +96,9 @@ def retrieve_chunks(rewrite: str, session: dict, search_outside_thread: bool):
|
|
| 61 |
"score_sem": float(sem_norm[order][local_rank]),
|
| 62 |
"score_combined": float(combined[order][local_rank]),
|
| 63 |
"text": c["text"],
|
|
|
|
|
|
|
|
|
|
| 64 |
})
|
| 65 |
return retrieved
|
| 66 |
|
|
@@ -124,6 +162,52 @@ def build_answer(user_text: str, rewrite: str, retrieved):
|
|
| 124 |
return answer, citations
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def log_trace(session_id: str, user_text: str, rewrite: str, retrieved, answer, citations):
|
| 128 |
trace_path = RUNS_DIR / "trace.jsonl"
|
| 129 |
|
|
|
|
| 1 |
+
# rag_retrieval.py
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
import uuid
|
| 5 |
import numpy as np
|
| 6 |
+
import re
|
| 7 |
|
| 8 |
from rag_config import RUNS_DIR, ROOT_DIR
|
| 9 |
from rag_data import chunks, bm25, embeddings, sem_model, THREAD_OPTIONS
|
|
|
|
| 11 |
|
| 12 |
# Ensure the runs directory exists before any trace files are written to it.
RUNS_DIR.mkdir(exist_ok=True)
|
| 13 |
|
| 14 |
+
# --- simple regex patterns for entities ---
FILE_PAT = re.compile(r"\b[\w\-.]+\.(?:pdf|docx?|xls[xm]?|pptx?|txt)\b", re.IGNORECASE)
EMAIL_PAT = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
# FIX: the original pattern put \b *before* the optional "$". A word boundary
# can never occur between whitespace/start-of-string and "$" (both non-word
# characters), so the dollar sign was silently dropped from every match.
# Placing \b after the optional currency prefix keeps bare numbers anchored
# while letting "$1,200.50" match in full.
AMOUNT_PAT = re.compile(r"(?:\$|USD\s*)?\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b")
DATE_PAT = re.compile(r"\b\d{1,2}/\d{1,2}/\d{2,4}\b")  # very simple date pattern
|
| 19 |
+
|
| 20 |
|
| 21 |
def rewrite_query(user_text: str, session: dict) -> str:
    """
    Rewrite a user query for retrieval by injecting the session's thread ID
    and a light summary of known entities from entity_memory.

    Args:
        user_text: The raw question typed by the user.
        session: Session dict; must contain "thread_id" and may contain an
            "entity_memory" dict with "people"/"files"/"amounts"/"dates" lists.

    Returns:
        The rewritten query string handed to the retriever.
    """
    tid = session["thread_id"]
    mem = session.get("entity_memory") or {}

    # The original repeated an identical lookup/format block per category;
    # collapse to one comprehension. Order is fixed (people, files, amounts,
    # dates) and at most three values per category are surfaced, exactly as
    # before. The walrus binding keeps the lookup and emptiness check together.
    key_bits = [
        f"{category}: {', '.join(values[:3])}"
        for category in ("people", "files", "amounts", "dates")
        if (values := mem.get(category) or [])
    ]

    context_str = ""
    if key_bits:
        context_str = "Known entities in this thread: " + "; ".join(key_bits) + ". "

    return f"In thread {tid}, {context_str}answer this question: {user_text}"
|
| 52 |
|
| 53 |
|
| 54 |
def retrieve_chunks(rewrite: str, session: dict, search_outside_thread: bool):
|
|
|
|
| 96 |
"score_sem": float(sem_norm[order][local_rank]),
|
| 97 |
"score_combined": float(combined[order][local_rank]),
|
| 98 |
"text": c["text"],
|
| 99 |
+
# carry over from/to so entity extraction can see people
|
| 100 |
+
"from_addr": c.get("from"),
|
| 101 |
+
"to_addr": c.get("to"),
|
| 102 |
})
|
| 103 |
return retrieved
|
| 104 |
|
|
|
|
| 162 |
return answer, citations
|
| 163 |
|
| 164 |
|
| 165 |
+
def extract_entities_for_turn(user_text: str, retrieved):
    """
    Collect lightweight entities mentioned in this turn.

    Scans the user question plus the text of every retrieved chunk for:
      - people: email addresses (including chunk from_addr/to_addr headers)
      - files: filenames like something.pdf
      - amounts: numbers / $ amounts
      - dates: simple numeric date patterns

    Returns a dict keyed by category with sorted, de-duplicated value lists;
    categories with no hits are omitted entirely.
    """
    buckets = {"people": set(), "files": set(), "amounts": set(), "dates": set()}

    # Chunk from/to headers are good 'people' proxies — harvest them first.
    for chunk in retrieved:
        for header in (chunk.get("from_addr"), chunk.get("to_addr")):
            if header:
                buckets["people"].update(EMAIL_PAT.findall(header))

    # Then scan the question and every chunk body against each pattern.
    pattern_table = (
        ("people", EMAIL_PAT),
        ("files", FILE_PAT),
        ("amounts", AMOUNT_PAT),
        ("dates", DATE_PAT),
    )
    for text in [user_text] + [chunk["text"] for chunk in retrieved]:
        for category, pattern in pattern_table:
            buckets[category].update(pattern.findall(text))

    # Same insertion order as the original result dict (people, amounts,
    # files, dates), with empty categories stripped.
    return {
        category: sorted(buckets[category])
        for category in ("people", "amounts", "files", "dates")
        if buckets[category]
    }
|
| 209 |
+
|
| 210 |
+
|
| 211 |
def log_trace(session_id: str, user_text: str, rewrite: str, retrieved, answer, citations):
|
| 212 |
trace_path = RUNS_DIR / "trace.jsonl"
|
| 213 |
|