Spaces:
Sleeping
Sleeping
Update src/rag_engine.py
Browse files
added support for new AcronymManager.py file
- src/rag_engine.py +22 -10
src/rag_engine.py
CHANGED
|
@@ -13,6 +13,7 @@ from sentence_transformers import CrossEncoder
|
|
| 13 |
# --- CUSTOM CORE IMPORTS ---
|
| 14 |
from core.ParagraphChunker import ParagraphChunker
|
| 15 |
from core.TokenChunker import TokenChunker
|
|
|
|
| 16 |
|
| 17 |
# --- CONFIGURATION ---
|
| 18 |
CHROMA_PATH = "chroma_db"
|
|
@@ -180,17 +181,20 @@ def process_and_add_text(text: str, source_name: str, username: str) -> Tuple[bo
|
|
| 180 |
return False, f"Error: {str(e)}"
|
| 181 |
|
| 182 |
def ingest_file(file_path: str, username: str, strategy: str = "paragraph") -> Tuple[bool, str]:
|
| 183 |
-
"""
|
| 184 |
-
The High-Level Bridge: Takes a file path, chunks it, and saves to Vector DB.
|
| 185 |
-
Replaces the old 'process_and_add_document'.
|
| 186 |
-
"""
|
| 187 |
try:
|
| 188 |
-
# 1. Chunk the file
|
| 189 |
docs = process_file(file_path, chunking_strategy=strategy)
|
| 190 |
|
| 191 |
if not docs:
|
| 192 |
return False, "No valid chunks generated from file."
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
# 2. Add to Chroma DB
|
| 195 |
user_db_path = os.path.join(CHROMA_PATH, username)
|
| 196 |
emb_fn = get_embedding_func()
|
|
@@ -205,24 +209,32 @@ def ingest_file(file_path: str, username: str, strategy: str = "paragraph") -> T
|
|
| 205 |
return False, f"System Error: {str(e)}"
|
| 206 |
|
| 207 |
def search_knowledge_base(query: str, username: str, k: int = 10, final_k: int = 4) -> List[Document]:
|
| 208 |
-
"""Retrieves top K chunks, then uses Cross-Encoder to re-rank them."""
|
| 209 |
user_db_path = os.path.join(CHROMA_PATH, username)
|
| 210 |
if not os.path.exists(user_db_path):
|
| 211 |
return []
|
| 212 |
|
| 213 |
try:
|
| 214 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
emb_fn = get_embedding_func()
|
| 216 |
db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
|
| 217 |
-
results = db.similarity_search_with_relevance_scores(
|
| 218 |
|
| 219 |
if not results:
|
| 220 |
return []
|
| 221 |
|
| 222 |
-
# 2. Reranking
|
| 223 |
candidate_docs = [doc for doc, _ in results]
|
| 224 |
candidate_texts = [doc.page_content for doc in candidate_docs]
|
| 225 |
-
pairs = [[
|
| 226 |
|
| 227 |
reranker = get_rerank_model()
|
| 228 |
scores = reranker.predict(pairs)
|
|
|
|
| 13 |
# --- CUSTOM CORE IMPORTS ---
|
| 14 |
from core.ParagraphChunker import ParagraphChunker
|
| 15 |
from core.TokenChunker import TokenChunker
|
| 16 |
+
from core.AcronymManager import AcronymManager
|
| 17 |
|
| 18 |
# --- CONFIGURATION ---
|
| 19 |
CHROMA_PATH = "chroma_db"
|
|
|
|
| 181 |
return False, f"Error: {str(e)}"
|
| 182 |
|
| 183 |
def ingest_file(file_path: str, username: str, strategy: str = "paragraph") -> Tuple[bool, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
try:
|
| 185 |
+
# 1. Chunk the file
|
| 186 |
docs = process_file(file_path, chunking_strategy=strategy)
|
| 187 |
|
| 188 |
if not docs:
|
| 189 |
return False, "No valid chunks generated from file."
|
| 190 |
|
| 191 |
+
# --- ACRONYM SCANNING ---
|
| 192 |
+
# We scan the raw text of the chunks to learn new definitions
|
| 193 |
+
acronym_mgr = AcronymManager()
|
| 194 |
+
for doc in docs:
|
| 195 |
+
acronym_mgr.scan_text_for_acronyms(doc.page_content)
|
| 196 |
+
# -----------------------------
|
| 197 |
+
|
| 198 |
# 2. Add to Chroma DB
|
| 199 |
user_db_path = os.path.join(CHROMA_PATH, username)
|
| 200 |
emb_fn = get_embedding_func()
|
|
|
|
| 209 |
return False, f"System Error: {str(e)}"
|
| 210 |
|
| 211 |
def search_knowledge_base(query: str, username: str, k: int = 10, final_k: int = 4) -> List[Document]:
|
|
|
|
| 212 |
user_db_path = os.path.join(CHROMA_PATH, username)
|
| 213 |
if not os.path.exists(user_db_path):
|
| 214 |
return []
|
| 215 |
|
| 216 |
try:
|
| 217 |
+
# --- NEW: QUERY EXPANSION ---
|
| 218 |
+
acronym_mgr = AcronymManager()
|
| 219 |
+
expanded_query = acronym_mgr.expand_query(query)
|
| 220 |
+
if expanded_query != query:
|
| 221 |
+
logger.info(f"Query Expanded: '{query}' -> '{expanded_query}'")
|
| 222 |
+
else:
|
| 223 |
+
expanded_query = query
|
| 224 |
+
# ----------------------------
|
| 225 |
+
|
| 226 |
+
# 1. Vector Retrieval (Use expanded_query instead of query)
|
| 227 |
emb_fn = get_embedding_func()
|
| 228 |
db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
|
| 229 |
+
results = db.similarity_search_with_relevance_scores(expanded_query, k=k) # <--- UPDATED VAR
|
| 230 |
|
| 231 |
if not results:
|
| 232 |
return []
|
| 233 |
|
| 234 |
+
# 2. Reranking (Pass expanded_query here too)
|
| 235 |
candidate_docs = [doc for doc, _ in results]
|
| 236 |
candidate_texts = [doc.page_content for doc in candidate_docs]
|
| 237 |
+
pairs = [[expanded_query, text] for text in candidate_texts] # <--- UPDATED VAR
|
| 238 |
|
| 239 |
reranker = get_rerank_model()
|
| 240 |
scores = reranker.predict(pairs)
|