Spaces:

NavyDevilDoc
/

AI_Toolkit

Sleeping

App Files Community

NavyDevilDoc committed on Dec 20, 2025

Commit

6695d4a

verified ·

1 Parent(s): 4c360e5

Update src/rag_engine.py

Browse files

refactored to use custom text splitting code

Files changed (1) hide show

src/rag_engine.py +114 -230

src/rag_engine.py CHANGED Viewed

@@ -1,254 +1,138 @@
 import os
-import shutil
-import time
-from langchain_text_splitters import RecursiveCharacterTextSplitter, TokenTextSplitter
-from langchain_chroma import Chroma
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.docstore.document import Document
-from sentence_transformers import CrossEncoder # Re-added for Reranking
-import doc_loader
-# --- CONFIGURATION ---
-CHROMA_PATH = "chroma_db"
-UPLOAD_DIR = "temp_ingest" # Re-added directory constant
-EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
-RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" # Re-added model name
-# --- LAZY LOADING GLOBALS ---
-# We use a global variable pattern to avoid loading heavy models
-# until the moment they are actually needed (saves startup RAM).
-_embedding_func = None
-_rerank_model = None
-def get_embedding_func():
-    """Lazy loads the embedding model."""
-    global _embedding_func
-    if _embedding_func is None:
-        print(f"⏳ Loading Embedding Model: {EMBED_MODEL_NAME}...")
-        _embedding_func = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
-        print("✅ Embedding Model Loaded.")
-    return _embedding_func
-def get_rerank_model():
-    """Lazy loads the Cross-Encoder model."""
-    global _rerank_model
-    if _rerank_model is None:
-        print(f"⏳ Loading Reranker: {RERANK_MODEL_NAME}...")
-        _rerank_model = CrossEncoder(RERANK_MODEL_NAME)
-        print("✅ Reranker Loaded.")
-    return _rerank_model
-# --- FILE OPERATIONS ---
-def save_uploaded_file(uploaded_file):
-    """Saves uploaded file to the temp directory."""
-    os.makedirs(UPLOAD_DIR, exist_ok=True)
-    file_path = os.path.join(UPLOAD_DIR, uploaded_file.name)
-    with open(file_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-    return file_path
-# --- INGESTION PIPELINE ---
-def process_and_add_document(file_path, username, strategy, use_vision=False, api_key=None):
     """
-    Ingests a document using the Universal Loader and adds it to the user's vector DB.
     """
-    user_db_path = os.path.join(CHROMA_PATH, username)
     try:
-        # 1. EXTRACT TEXT (Using doc_loader)
-        # We need a pseudo-object because doc_loader expects a Streamlit object,
-        # but we are reading from disk.
-        with open(file_path, "rb") as f:
-            class FileObj:
-                def __init__(self, f, name):
-                    self.f = f
-                    self.name = name
-                def read(self): return self.f.read()
-            file_obj = FileObj(f, os.path.basename(file_path))
-            raw_text = doc_loader.extract_text_from_file(file_obj, use_vision=use_vision, api_key=api_key)
-        if not raw_text or not raw_text.strip():
-            return False, "Document appears empty or could not be read."
-        # 2. CHUNK TEXT
-        chunks = []
-        if strategy == "paragraph":
-            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-            chunks = splitter.split_text(raw_text)
-        elif strategy == "token":
-            splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)
-            chunks = splitter.split_text(raw_text)
-        elif strategy == "page":
-            splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
-            chunks = splitter.split_text(raw_text)
-        # 3. CREATE DOCUMENTS
-        docs = [
-            Document(
-                page_content=chunk,
-                metadata={"source": os.path.basename(file_path), "strategy": strategy}
-            )
-            for chunk in chunks
         ]
-        # 4. INDEX TO CHROMA
-        if docs:
-            # Use the getter function (Lazy Load)
-            emb_fn = get_embedding_func()
-            db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-            db.add_documents(docs)
-            return True, f"Successfully indexed {len(docs)} chunks from {os.path.basename(file_path)}."
-        else:
-            return False, "No chunks created."
-    except Exception as e:
-        return False, f"Error processing document: {e}"
-# --- SEARCH PIPELINE (Now with Reranking!) ---
-def search_knowledge_base(query, username, k=10, final_k=4):
-    """
-    Retrieves top K chunks, then uses a Cross-Encoder to re-rank them
-    and returns the top final_k most relevant chunks.
-    """
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if not os.path.exists(user_db_path):
-        return []
-    try:
-        # 1. INITIAL RETRIEVAL (Vector Similarity)
-        emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        # Fetch more candidates (k=10) to give the reranker options
-        results = db.similarity_search_with_relevance_scores(query, k=k)
-        if not results:
-            return []
-        # 2. RERANKING
-        # Extract just the text for the cross-encoder
-        candidate_docs = [doc for doc, _ in results]
-        candidate_texts = [doc.page_content for doc in candidate_docs]
-        if not candidate_texts:
-            return []
-        # Form pairs: (Query, Document Text)
-        pairs = [[query, text] for text in candidate_texts]
-        # Score pairs
-        reranker = get_rerank_model()
-        scores = reranker.predict(pairs)
-        # Attach scores to documents and sort
-        scored_docs = list(zip(candidate_docs, scores))
-        # Sort by score descending (High score = Better match)
-        scored_docs.sort(key=lambda x: x[1], reverse=True)
-        # 3. RETURN TOP N
-        # Return only the document objects of the top final_k
-        final_docs = [doc for doc, score in scored_docs[:final_k]]
         return final_docs
     except Exception as e:
-        print(f"RAG Error: {e}")
         return []
-# --- MANAGEMENT UTILS ---
-def list_documents(username):
-    """Returns a list of unique sources in the user's DB."""
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if not os.path.exists(user_db_path):
-        return []
-    try:
-        emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        data = db.get()
-        metadatas = data['metadatas']
-        inventory = {}
-        for m in metadatas:
-            src = m.get('source', 'Unknown')
-            if src not in inventory:
-                inventory[src] = {"chunks": 0, "strategy": m.get('strategy', 'Unknown')}
-            inventory[src]["chunks"] += 1
-        return [{"filename": k, "chunks": v["chunks"], "strategy": v["strategy"], "source": k} for k, v in inventory.items()]
-    except:
         return []
-def delete_document(username, source_name):
-    """Removes all chunks associated with a specific source file."""
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    try:
-        emb_fn = get_embedding_func()
-        db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-        data = db.get()
-        ids_to_delete = []
-        for i, meta in enumerate(data['metadatas']):
-            if meta.get('source') == source_name:
-                ids_to_delete.append(data['ids'][i])
-        if ids_to_delete:
-            db.delete(ids=ids_to_delete)
-            return True, f"Deleted {source_name}."
         else:
-            return False, "File not found in index."
-    except Exception as e:
-        return False, f"Delete failed: {e}"
-def reset_knowledge_base(username):
-    """Wipes the entire user database."""
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    if os.path.exists(user_db_path):
-        shutil.rmtree(user_db_path)
-        return True, "Database Reset."
-    return False, "Database already empty."
-def process_and_add_text(raw_text, source_name, username, strategy="paragraph"):
     """
-    Directly indexes a raw text string into the user's vector DB.
-    Useful for indexing content generated by the LLM (like flattened notes).
     """
-    user_db_path = os.path.join(CHROMA_PATH, username)
-    try:
-        if not raw_text or not raw_text.strip():
-            return False, "Content appears empty."
-        # 1. CHUNK TEXT (Reusing the standard logic)
-        chunks = []
-        if strategy == "paragraph":
-            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-            chunks = splitter.split_text(raw_text)
-        elif strategy == "token":
-            splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=50)
-            chunks = splitter.split_text(raw_text)
-        elif strategy == "page":
-            splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
-            chunks = splitter.split_text(raw_text)
-        # 2. CREATE DOCUMENTS
-        # We append "_flattened" to the source name so you can distinguish it from the original
-        docs = [
-            Document(
-                page_content=chunk,
-                metadata={"source": source_name, "strategy": f"{strategy}-flattened"}
-            )
-            for chunk in chunks
-        ]
-        # 3. INDEX TO CHROMA
-        if docs:
-            emb_fn = get_embedding_func()
-            db = Chroma(persist_directory=user_db_path, embedding_function=emb_fn)
-            db.add_documents(docs)
-            return True, f"Successfully indexed {len(docs)} flattened chunks."
-        else:
-            return False, "No chunks created."
-    except Exception as e:
-        return False, f"Error processing text: {e}"

 import os
+import logging
+from typing import List, Literal
+# LangChain imports for the Markdown logic
+from langchain_core.documents import Document
+from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+# Custom Core Imports
+from core.ParagraphChunker import ParagraphChunker
+from core.TokenChunker import TokenChunker
+# Configure Logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+def _process_markdown(file_path: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> List[Document]:
     """
+    Internal helper to process Markdown files using Header Semantic Splitting.
     """
     try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            markdown_text = f.read()
+        # Define headers to split on (Logic: Keep context attached to the section)
+        headers_to_split_on = [
+            ("#", "Header 1"),
+            ("##", "Header 2"),
+            ("###", "Header 3"),
         ]
+        # Stage 1: Split by Structure (Headers)
+        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
+        md_header_splits = markdown_splitter.split_text(markdown_text)
+        # Stage 2: Split by Size (Recursively split long sections)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap
+        )
+        final_docs = text_splitter.split_documents(md_header_splits)
+        # Add source metadata
+        for doc in final_docs:
+            doc.metadata['source'] = file_path
+            doc.metadata['file_type'] = 'md'
+        logger.info(f"Markdown processing complete: {len(final_docs)} chunks created.")
         return final_docs
     except Exception as e:
+        logger.error(f"Error processing Markdown file {file_path}: {e}")
         return []
+def process_file(
+    file_path: str,
+    chunking_strategy: Literal["paragraph", "token"] = "paragraph",
+    chunk_size: int = 512,
+    chunk_overlap: int = 50,
+    model_name: str = "gpt-4o" # Used for token counting in your custom classes
+) -> List[Document]:
+    """
+    Main entry point for processing a single file.
+    Routes to the correct custom chunker or markdown handler based on extension.
+    """
+    if not os.path.exists(file_path):
+        logger.error(f"File not found: {file_path}")
         return []
+    file_extension = os.path.splitext(file_path)[1].lower()
+    logger.info(f"Processing {file_path} using strategy: {chunking_strategy}")
+    # ---------------------------------------------------------
+    # 1. Handle Markdown (Specialized Logic)
+    # ---------------------------------------------------------
+    if file_extension == ".md":
+        return _process_markdown(file_path, chunk_size, chunk_overlap)
+    # ---------------------------------------------------------
+    # 2. Handle PDF and TXT (Custom Core Logic)
+    # ---------------------------------------------------------
+    elif file_extension in [".pdf", ".txt"]:
+        # Initialize the appropriate Custom Chunker
+        if chunking_strategy == "token":
+            chunker = TokenChunker(
+                model_name=model_name,
+                chunk_size=chunk_size,
+                chunk_overlap=chunk_overlap
+            )
         else:
+            # Paragraph chunker relies on semantic boundaries, not strict sizes
+            chunker = ParagraphChunker(model_name=model_name)
+        # Process based on file type
+        try:
+            if file_extension == ".pdf":
+                # Uses OCREnhancedPDFLoader internally via BaseChunker
+                return chunker.process_document(file_path)
+            elif file_extension == ".txt":
+                # Uses direct text reading with paragraph preservation
+                return chunker.process_text_file(file_path)
+        except Exception as e:
+            logger.error(f"Error using {chunking_strategy} chunker on {file_path}: {e}")
+            return []
+    else:
+        logger.warning(f"Unsupported file extension: {file_extension}")
+        return []
+def load_documents_from_directory(
+    directory_path: str,
+    chunking_strategy: Literal["paragraph", "token"] = "paragraph"
+) -> List[Document]:
     """
+    Batch helper to process a directory of files.
     """
+    all_docs = []
+    for root, _, files in os.walk(directory_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            # Only process supported extensions
+            if file.lower().endswith(('.pdf', '.txt', '.md')):
+                docs = process_file(file_path, chunking_strategy=chunking_strategy)
+                all_docs.extend(docs)
+    return all_docs
+# Quick test block
+if __name__ == "__main__":
+    # Example usage
+    print("--- Testing Rag Engine ---")
+    # You can point this to a dummy file to test
+    # docs = process_file("test_data/navy_manual.pdf", chunking_strategy="paragraph")
+    # print(f"Loaded {len(docs)} chunks.")