Update rag_utils.py
rag_utils.py  CHANGED  +61 -53

@@ -8,46 +8,34 @@ from rank_bm25 import BM25Okapi
 from sentence_transformers import SentenceTransformer
 from sklearn.preprocessing import MinMaxScaler
 import numpy as np
-from typing import Any, List
+from typing import Any, List, Tuple
 import asyncio
-import torch
+import torch
+import time
+from flashrank import Ranker, RerankRequest  # Import the FlashRank library
 
 # --- Configuration (can be overridden by the calling app) ---
 CHUNK_SIZE = 1000
 CHUNK_OVERLAP = 200
-TOP_K_CHUNKS = 5
+TOP_K_CHUNKS = 5  # The final number of chunks to send to the LLM
+# A larger number of initial candidates for reranking
+INITIAL_K_CANDIDATES = 20
 GROQ_MODEL_NAME = "llama3-8b-8192"
-EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"  # A good general-purpose embedding model
 
 # --- Class for managing the Sentence Transformer model ---
 class EmbeddingClient:
     """A client for generating text embeddings using a local, open-source model."""
     def __init__(self, model_name: str = EMBEDDING_MODEL_NAME):
-        """
-        Initializes the SentenceTransformer model and moves it to the GPU if available.
-        """
-        # Determine if a GPU is available and set the device accordingly
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {self.device}")
-
-        # Load the model and move it to the determined device (GPU or CPU)
         self.model = SentenceTransformer(model_name, device=self.device)
         print(f"Sentence Transformer embedding client initialized ({model_name}) on {self.device}.")
 
     def get_embeddings(self, texts: List[str]) -> torch.Tensor:
-        """
-        Generates embeddings for a list of text chunks on the GPU.
-        Args:
-            texts: A list of strings (our document chunks) to be embedded.
-        Returns:
-            A tensor of embedding vectors on the GPU.
-        """
         if not texts:
             return torch.tensor([])
-
         print(f"Generating embeddings for {len(texts)} text chunks on {self.device}...")
-        # The .encode() method efficiently converts a list of texts into embeddings.
-        # It handles moving the data to the correct device internally.
         embeddings = self.model.encode(texts, convert_to_tensor=True, show_progress_bar=False)
         print("Embeddings generated successfully.")
         return embeddings
@@ -56,88 +44,112 @@ class EmbeddingClient:
 class HybridSearchManager:
     """
     Manages the initialization and execution of a hybrid search system
-    combining BM25
+    combining BM25, dense vector search, and a fast reranker.
     """
     def __init__(self, embedding_model_name: str = EMBEDDING_MODEL_NAME):
         self.bm25_model = None
         self.embedding_client = EmbeddingClient(model_name=embedding_model_name)
         self.document_chunks = []
         self.document_embeddings = None
+        self.reranker = Ranker()  # Initialize the FlashRank reranker
+        print("FlashRank reranker initialized.")
 
     async def initialize_models(self, documents: list[Document]):
-        """
-        Initializes BM25 and computes document embeddings on the GPU.
-        """
         self.document_chunks = documents
         corpus = [doc.page_content for doc in documents]
         if not corpus:
             print("No documents to initialize. Skipping model setup.")
             return
-
-        # Initialize BM25 model (CPU-bound)
         print("Initializing BM25 model...")
         tokenized_corpus = [doc.split(" ") for doc in corpus]
         self.bm25_model = BM25Okapi(tokenized_corpus)
         print("BM25 model initialized.")
-
-        # Compute and store document embeddings on the GPU
        print(f"Computing and storing document embeddings on {self.embedding_client.device}...")
         self.document_embeddings = self.embedding_client.get_embeddings(corpus)
         print("Document embeddings computed.")
-
-    async def perform_hybrid_search(self, query: str, top_k: int) -> List[dict]:
+
+    async def perform_hybrid_search(self, query: str, top_k: int) -> Tuple[List[dict], float]:
         """
-        Performs a hybrid search
+        Performs a hybrid search, then reranks the results, and returns the top chunks
+        along with the time taken for reranking.
         """
         if self.bm25_model is None or self.document_embeddings is None:
             raise ValueError("Hybrid search models are not initialized. Call initialize_models first.")
         print(f"Performing hybrid search for query: '{query}' (top_k={top_k})...")
 
-        #
+        # Get a larger number of chunks for a better reranking pool
         tokenized_query = query.split(" ")
         bm25_scores = self.bm25_model.get_scores(tokenized_query)
 
-        # Dense search (GPU-bound)
-        # Get query embedding on the GPU
         query_embedding = self.embedding_client.get_embeddings([query])
-
-        # Perform cosine similarity on the GPU
         from torch.nn.functional import cosine_similarity
         dense_scores = cosine_similarity(query_embedding, self.document_embeddings)
-
-        # Move dense scores back to CPU for subsequent processing
         dense_scores = dense_scores.cpu().numpy()
 
         if len(bm25_scores) == 0 or len(dense_scores) == 0:
-            return []
+            return [], 0.0
 
         scaler = MinMaxScaler()
         normalized_bm25_scores = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
         normalized_dense_scores = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()
         combined_scores = 0.5 * normalized_bm25_scores + 0.5 * normalized_dense_scores
+
+        # We now get `INITIAL_K_CANDIDATES` documents from the combined search
         ranked_indices = np.argsort(combined_scores)[::-1]
-
+        top_initial_indices = ranked_indices[:INITIAL_K_CANDIDATES]
+
         retrieved_results = []
-        for idx in ranked_indices[:top_k]:
+        for idx in top_initial_indices:
             doc = self.document_chunks[idx]
             retrieved_results.append({
                 "content": doc.page_content,
                 "document_metadata": doc.metadata
             })
-
-        return retrieved_results
+
+        print(f"Retrieved {len(retrieved_results)} initial chunks. Starting reranking...")
+
+        # --- Reranking Step with Timing ---
+        start_time_rerank = time.perf_counter()
+        if not retrieved_results:
+            return [], 0.0
+
+        # FlashRank expects a list of dictionaries with a "text" key
+        passages = [{"text": chunk["content"]} for chunk in retrieved_results]
+
+        # The reranker takes a query and a list of passages and returns a reranked list
+        reranked_results = await asyncio.to_thread(
+            self.reranker.rerank, RerankRequest(query=query, passages=passages)
+        )
+
+        end_time_rerank = time.perf_counter()
+        rerank_time = end_time_rerank - start_time_rerank
+
+        # Re-map the reranked results back to our original document format
+        final_chunks = []
+        for res in reranked_results:
+            # Find the original chunk based on the text
+            original_chunk_data = next(
+                (c for c in retrieved_results if c["content"] == res["text"]),
+                None
+            )
+            if original_chunk_data:
+                final_chunks.append({
+                    "content": original_chunk_data["content"],
+                    "document_metadata": original_chunk_data["document_metadata"],
+                    "rerank_score": res["score"]
+                })
+
+        # Return the top_k reranked chunks and the timing information
+        print(f"Reranking completed in {rerank_time:.4f} seconds. Retrieved {len(final_chunks[:top_k])} top chunks.")
+        return final_chunks[:top_k], rerank_time
+
 
-# --- Helper Functions (remain unchanged
+# --- Helper Functions (remain unchanged) ---
 def process_markdown_with_manual_sections(
     md_file_path: str,
     headings_json: dict,
     chunk_size: int,
     chunk_overlap: int):
-    """
-    Processes a markdown document from a file path by segmenting it based on
-    provided section headings, and then recursively chunking each segment.
-    Each chunk receives the corresponding section heading as metadata.
-    """
     all_chunks_with_metadata = []
     full_text = ""
     if not os.path.exists(md_file_path):
@@ -168,7 +180,6 @@ def process_markdown_with_manual_sections(
     heading_positions = []
     for heading in heading_texts:
         pattern = re.compile(r'\s*'.join(re.escape(word) for word in heading.split()), re.IGNORECASE)
-
         match = pattern.search(full_text)
         if match:
             heading_positions.append({"heading_text": heading, "start_index": match.start()})
@@ -176,7 +187,6 @@ def process_markdown_with_manual_sections(
             print(f"Warning: Heading '{heading}' not found in the markdown text using regex. This section might be missed.")
     heading_positions.sort(key=lambda x: x["start_index"])
     segments_with_headings = []
-
     if heading_positions and heading_positions[0]["start_index"] > 0:
         preface_text = full_text[:heading_positions[0]["start_index"]].strip()
         if preface_text:
@@ -187,12 +197,10 @@ def process_markdown_with_manual_sections(
     for i, current_heading_info in enumerate(heading_positions):
         start_index = current_heading_info["start_index"]
         heading_text = current_heading_info["heading_text"]
-
         end_index = len(full_text)
         if i + 1 < len(heading_positions):
             end_index = heading_positions[i+1]["start_index"]
         section_content = full_text[start_index:end_index].strip()
-
         if section_content:
             segments_with_headings.append({
                 "section_heading": heading_text,
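A note on the score fusion inside perform_hybrid_search: combined_scores is plain per-list min-max scaling followed by an equal-weight average, so a chunk has to do reasonably well on both signals to rank high. A standalone sketch of just that arithmetic, with made-up scores (NumPy and scikit-learn only, both already imported by the module):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

bm25_scores = np.array([2.1, 0.0, 5.3])      # hypothetical sparse scores
dense_scores = np.array([0.62, 0.10, 0.55])  # hypothetical cosine similarities

scaler = MinMaxScaler()
nb = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()   # [0.396, 0.0, 1.0]
nd = scaler.fit_transform(dense_scores.reshape(-1, 1)).flatten()  # [1.0, 0.0, 0.865]
combined = 0.5 * nb + 0.5 * nd                                    # [0.698, 0.0, 0.933]

# Highest combined score first, mirroring np.argsort(combined_scores)[::-1] above.
print(np.argsort(combined)[::-1])  # -> [2 0 1]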
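One design note on the re-mapping loop: matching reranked passages back to the originals by exact text equality costs a linear next(...) scan per result and conflates chunks whose text happens to be identical. FlashRank passages can also carry "id" and "meta" keys that are echoed back alongside the "score", which makes the join direct. A sketch under that assumption (the id/meta passthrough follows FlashRank's documented usage, but verify it against the installed version; it is not what this commit does):

from flashrank import Ranker, RerankRequest


def rerank_with_ids(ranker: Ranker, query: str,
                    retrieved_results: list[dict], top_k: int) -> list[dict]:
    # Tag each passage with its index so no text matching is needed afterwards.
    passages = [
        {"id": i, "text": chunk["content"], "meta": chunk["document_metadata"]}
        for i, chunk in enumerate(retrieved_results)
    ]
    reranked = ranker.rerank(RerankRequest(query=query, passages=passages))
    return [
        {
            "content": retrieved_results[res["id"]]["content"],
            "document_metadata": retrieved_results[res["id"]]["document_metadata"],
            "rerank_score": res["score"],
        }
        for res in reranked[:top_k]
    ]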