Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Community

NavyDevilDoc committed on Dec 21, 2025

Commit

ff3310f

verified ·

1 Parent(s): 4de416e

Update src/rag_engine.py

Browse files

removed chroma support and added pinecone

Files changed (1) hide show

src/rag_engine.py +89 -116

src/rag_engine.py CHANGED Viewed

@@ -4,22 +4,23 @@ import logging
 from typing import List, Literal, Tuple
 # --- LANGCHAIN & DB IMPORTS ---
-from langchain_chroma import Chroma
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from sentence_transformers import CrossEncoder
 # --- CUSTOM CORE IMPORTS ---
 from core.ParagraphChunker import ParagraphChunker
 from core.TokenChunker import TokenChunker
 from core.AcronymManager import AcronymManager
 # --- CONFIGURATION ---
-CHROMA_PATH = "chroma_db"
 UPLOAD_DIR = "source_documents"
 EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
 RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
 # Configure Logging
 logging.basicConfig(level=logging.INFO)
@@ -133,7 +134,7 @@ def process_file(
         logger.warning(f"Unsupported file extension: {file_extension}")
         return []
-# --- PART 2: DATABASE & FILE MANAGEMENT (The Old Stable System) ---
 def save_uploaded_file(uploaded_file, username: str = "default") -> str:
     """Saves a StreamlitUploadedFile to disk so the loaders can read it."""
@@ -144,102 +145,99 @@ def save_uploaded_file(uploaded_file, username: str = "default") -> str:
         with open(file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
-        logger.info(f"File saved: {file_path}")
         return file_path
     except Exception as e:
         logger.error(f"Error saving file: {e}")
         return None
-def process_and_add_text(text: str, source_name: str, username: str) -> Tuple[bool, str]:
-    """
-    Ingests raw text string (e.g., from the Flattener tool) directly into Chroma.
-    """
     try:
-        user_db_path = os.path.join(CHROMA_PATH, username)
-        emb_fn = get_embedding_func()
-        # Initialize DB
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        # Create a Document object directly
         doc = Document(
             page_content=text,
-            metadata={
-                "source": source_name,
-                "strategy": "flattened_text",
-                "file_type": "generated"
-            }
         )
-        # Add single document
-        db.add_documents([doc])
-        return True, f"Successfully indexed flattened text: {source_name}"
     except Exception as e:
-        logger.error(f"Error indexing raw text: {e}")
-        return False, f"Error: {str(e)}"
-def ingest_file(file_path: str, username: str, strategy: str = "paragraph") -> Tuple[bool, str]:
     try:
-        # 1. Chunk the file
         docs = process_file(file_path, chunking_strategy=strategy)
-        if not docs:
-            return False, "No valid chunks generated from file."
-        # --- ACRONYM SCANNING ---
-        # We scan the raw text of the chunks to learn new definitions
         acronym_mgr = AcronymManager()
         for doc in docs:
             acronym_mgr.scan_text_for_acronyms(doc.page_content)
-        # -----------------------------
-        # 2. Add to Chroma DB
-        user_db_path = os.path.join(CHROMA_PATH, username)
         emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        db.add_documents(docs)
         return True, f"Successfully indexed {len(docs)} chunks."
     except Exception as e:
         logger.error(f"Ingestion failed: {e}")
-        return False, f"System Error: {str(e)}"
-def search_knowledge_base(query: str, username: str, k: int = 10, final_k: int = 4) -> List[Document]:
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if not os.path.exists(user_db_path):
-        return []
     try:
-        # --- NEW: QUERY EXPANSION ---
         acronym_mgr = AcronymManager()
         expanded_query = acronym_mgr.expand_query(query)
-        if expanded_query != query:
-            logger.info(f"Query Expanded: '{query}' -> '{expanded_query}'")
-        else:
-            expanded_query = query
-        # ----------------------------
-        # 1. Vector Retrieval (Use expanded_query instead of query)
         emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        results = db.similarity_search_with_relevance_scores(expanded_query, k=k) # <--- UPDATED VAR
-        if not results:
-            return []
-        # 2. Reranking (Pass expanded_query here too)
-        candidate_docs = [doc for doc, _ in results]
         candidate_texts = [doc.page_content for doc in candidate_docs]
-        pairs = [[expanded_query, text] for text in candidate_texts] # <--- UPDATED VAR
         reranker = get_rerank_model()
         scores = reranker.predict(pairs)
-        # Sort by new score
         scored_docs = list(zip(candidate_docs, scores))
         scored_docs.sort(key=lambda x: x[1], reverse=True)
@@ -251,67 +249,42 @@ def search_knowledge_base(query: str, username: str, k: int = 10, final_k: int =
 def list_documents(username: str) -> List[dict]:
     """
-    Returns a list of unique files currently in the vector database.
-    (Used for the sidebar list)
     """
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if not os.path.exists(user_db_path):
-        return []
-    try:
-        emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        # Chroma's .get() returns all metadata
-        data = db.get()
-        metadatas = data['metadatas']
-        inventory = {}
-        for m in metadatas:
-            # Metadata keys might differ slightly, handle gracefully
-            src = m.get('source', 'Unknown')
-            if src not in inventory:
-                inventory[src] = {
-                    "chunks": 0,
-                    "strategy": m.get('strategy', 'unknown')
-                }
-            inventory[src]["chunks"] += 1
-        # FIXED: Added "source": k to the dictionary below
-        return [
-            {"filename": k, "chunks": v["chunks"], "strategy": v["strategy"], "source": k}
-            for k, v in inventory.items()
-        ]
-    except Exception as e:
-        logger.error(f"Error listing docs: {e}")
-        return []
-def delete_document(username: str, filename: str) -> Tuple[bool, str]:
-    """Removes a document from the vector database."""
-    user_db_path = os.path.join(CHROMA_PATH, username)
     try:
-        emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        data = db.get()
-        ids_to_delete = []
-        for i, meta in enumerate(data['metadatas']):
-            if meta.get('source') == filename:
-                ids_to_delete.append(data['ids'][i])
-        if ids_to_delete:
-            db.delete(ids=ids_to_delete)
-            return True, f"Deleted {filename}."
-        else:
-            return False, "File not found in index."
     except Exception as e:
-        return False, f"Delete failed: {e}"
 def reset_knowledge_base(username: str) -> Tuple[bool, str]:
-    """Nukes the user's database folder."""
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if os.path.exists(user_db_path):
-        shutil.rmtree(user_db_path)
-        return True, "Database Reset."
-    return False, "Database already empty."

 from typing import List, Literal, Tuple
 # --- LANGCHAIN & DB IMPORTS ---
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_core.documents import Document
 from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
 from sentence_transformers import CrossEncoder
 # --- CUSTOM CORE IMPORTS ---
+from core.PineconeManager import PineconeManager
 from core.ParagraphChunker import ParagraphChunker
 from core.TokenChunker import TokenChunker
 from core.AcronymManager import AcronymManager
 # --- CONFIGURATION ---
 UPLOAD_DIR = "source_documents"
 EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
 RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+PINECONE_KEY = os.getenv("PINECONE_API_KEY")
 # Configure Logging
 logging.basicConfig(level=logging.INFO)
         logger.warning(f"Unsupported file extension: {file_extension}")
         return []
+# --- PART 2: DATABASE & FILE MANAGEMENT (Pinecone Version) ---
 def save_uploaded_file(uploaded_file, username: str = "default") -> str:
     """Saves a StreamlitUploadedFile to disk so the loaders can read it."""
         with open(file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
         return file_path
     except Exception as e:
         logger.error(f"Error saving file: {e}")
         return None
def process_and_add_text(text: str, source_name: str, username: str, index_name: str) -> Tuple[bool, str]:
    """Ingest raw text (Flattener): save a backup to disk, then upload to Pinecone.

    Args:
        text: Raw text to index; also written verbatim to the backup file.
        source_name: Plain filename used both as the backup filename and as
            the ``source`` metadata of the indexed document. Names containing
            path components are rejected.
        username: Sub-directory of ``UPLOAD_DIR`` for the backup, and the
            Pinecone namespace.
        index_name: Name of the target Pinecone index.

    Returns:
        ``(True, message)`` on success, otherwise ``(False, reason)``.
    """
    # source_name ends up in a filesystem path, so accept only a bare filename.
    # Anything with a directory part (e.g. "../x") could otherwise write
    # outside the user's backup directory.
    if (
        not source_name
        or source_name in (".", "..")
        or os.path.basename(source_name) != source_name
    ):
        return False, "Invalid source name."
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."

    try:
        # 1. SAVE PHYSICAL BACKUP (For Quiz Engine)
        user_docs_dir = os.path.join(UPLOAD_DIR, username)
        os.makedirs(user_docs_dir, exist_ok=True)
        backup_path = os.path.join(user_docs_dir, source_name)
        with open(backup_path, "w", encoding="utf-8") as f:
            f.write(text)

        # 2. UPLOAD TO PINECONE
        pm = PineconeManager(PINECONE_KEY)
        emb_fn = get_embedding_func()

        # The whole text is indexed as a single document (no chunking).
        doc = Document(
            page_content=text,
            metadata={"source": source_name, "strategy": "flattened", "file_type": "generated"},
        )

        # Add to VectorStore (Namespace = Username)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        vstore.add_documents([doc])
        return True, f"Indexed and backed up: {source_name}"
    except Exception as e:
        logger.error(f"Error indexing text: {e}")
        return False, str(e)
def ingest_file(file_path: str, username: str, index_name: str, strategy: str = "paragraph") -> Tuple[bool, str]:
    """Chunk a file, scan it for acronyms, and upload the chunks to Pinecone.

    Args:
        file_path: Path of the file to ingest; forwarded to ``process_file``.
        username: Used as the Pinecone namespace.
        index_name: Name of the target Pinecone index.
        strategy: Chunking strategy forwarded to ``process_file``.

    Returns:
        ``(True, message)`` with the number of chunks indexed, otherwise
        ``(False, reason)``.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Pinecone Configuration Missing."

    try:
        # 1. Chunking
        docs = process_file(file_path, chunking_strategy=strategy)
        if not docs:
            return False, "No valid chunks generated."

        # 2. Acronym Learning
        acronym_mgr = AcronymManager()
        for doc in docs:
            acronym_mgr.scan_text_for_acronyms(doc.page_content)

        # 3. Pinecone Safety Check
        # Measure the embedding width from the function that will actually be
        # used instead of hard-coding it. The previous literal 384 does not
        # match the configured EMBED_MODEL_NAME (all-mpnet-base-v2 emits
        # 768-d vectors), so a correctly sized index would be rejected.
        # NOTE(review): assumes get_embedding_func() returns a LangChain
        # Embeddings object exposing embed_query -- confirm.
        emb_fn = get_embedding_func()
        embed_dim = len(emb_fn.embed_query("dimension probe"))
        pm = PineconeManager(PINECONE_KEY)
        if not pm.check_dimension_compatibility(index_name, embed_dim):
            return False, f"Dimension Mismatch! Index {index_name} is not {embed_dim}d."

        # 4. Upload (Namespace = Username)
        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
        vstore.add_documents(docs)
        return True, f"Successfully indexed {len(docs)} chunks."
    except Exception as e:
        logger.error(f"Ingestion failed: {e}")
        return False, str(e)
+def search_knowledge_base(query: str, username: str, index_name: str, k: int = 10, final_k: int = 4) -> List[Document]:
+    """Retrieves from Pinecone -> Reranks."""
+    if not PINECONE_KEY or not index_name: return []
     try:
+        # 1. Expand Query (Acronyms)
         acronym_mgr = AcronymManager()
         expanded_query = acronym_mgr.expand_query(query)
+        # 2. Vector Search
+        pm = PineconeManager(PINECONE_KEY)
         emb_fn = get_embedding_func()
+        vstore = pm.get_vectorstore(index_name, emb_fn, namespace=username)
+        results = vstore.similarity_search(expanded_query, k=k)
+        if not results: return []
+        # 3. Reranking
+        candidate_docs = results
         candidate_texts = [doc.page_content for doc in candidate_docs]
+        pairs = [[expanded_query, text] for text in candidate_texts]
         reranker = get_rerank_model()
         scores = reranker.predict(pairs)
+        # Sort
         scored_docs = list(zip(candidate_docs, scores))
         scored_docs.sort(key=lambda x: x[1], reverse=True)
 def list_documents(username: str) -> List[dict]:
     """
+    NOTE: Pinecone does not support easy listing of all unique files.
+    We return the Local Cache (source_documents) as a proxy for what is
+    available for the Quiz Engine.
     """
+    user_dir = os.path.join(UPLOAD_DIR, username)
+    if not os.path.exists(user_dir): return []
+    files = []
+    for f in os.listdir(user_dir):
+        if f.lower().endswith(('.pdf', '.txt', '.md')):
+            files.append({"filename": f, "source": f, "strategy": "local_cache"})
+    return files
def delete_document(username: str, filename: str, index_name: str) -> Tuple[bool, str]:
    """Delete a document from Pinecone and remove its local cached copy.

    Args:
        username: Pinecone namespace and sub-directory of ``UPLOAD_DIR``.
        filename: Source name passed to ``PineconeManager.delete_file`` and
            the name of the cached file to remove from disk.
        index_name: Name of the Pinecone index to delete from.

    Returns:
        ``(True, message)`` on success, otherwise ``(False, reason)``.
    """
    if not PINECONE_KEY or not index_name:
        return False, "Config Missing."

    try:
        # 1. Delete from Pinecone
        pm = PineconeManager(PINECONE_KEY)
        pm.delete_file(index_name, filename, namespace=username)

        # 2. Delete from Disk (Clean up Quiz Cache)
        # Resolve both paths and only remove the file when it really lives
        # inside the user's cache directory, so a filename such as
        # "../other_user/doc.pdf" cannot delete files elsewhere.
        user_dir = os.path.realpath(os.path.join(UPLOAD_DIR, username))
        local_path = os.path.realpath(os.path.join(user_dir, filename))
        if not local_path.startswith(user_dir + os.sep):
            logger.warning(f"Skipped disk delete outside user directory: {filename}")
        elif os.path.isfile(local_path):
            os.remove(local_path)

        return True, f"Deleted {filename} from Index and Disk."
    except Exception as e:
        # Log like the other entry points do; previously the error was only
        # returned to the caller and never recorded.
        logger.error(f"Delete failed: {e}")
        return False, str(e)
 def reset_knowledge_base(username: str) -> Tuple[bool, str]:
+    """
+    WARNING: This deletes the USER NAMESPACE in Pinecone, not the whole Index.
+    """
+    # Pinecone delete_all is index-wide usually.
+    # For safety in namespace-based multi-tenancy, we usually skip this
+    # or implement a delete_all(delete_all=True, namespace=username)
+    return False, "Resetting entire DB via API is disabled for safety. Use Delete."