Spaces:

PercivalFletcher
/

HackRx

Sleeping

App Files Files Community

PercivalFletcher committed on Aug 7, 2025

Commit

7210c17

verified ·

1 Parent(s): b0d13c1

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +22 -11

pipeline.py CHANGED Viewed

@@ -3,6 +3,7 @@ import time
 from pathlib import Path
 from typing import List, Any, Union
 import asyncio # Import asyncio for concurrent operations
 from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
 from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
@@ -11,6 +12,7 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
 from llama_index.readers.file import PyMuPDFReader
 from llama_index.llms.groq import Groq
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 class Pipeline:
@@ -33,7 +35,8 @@ class Pipeline:
         self.groq_api_key = groq_api_key
         self.pdf_path = Path(pdf_path)
         self.embed_model = embed_model
         # Configure Llama-Index LLM setting only
         Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
@@ -107,26 +110,34 @@ class Pipeline:
         end_time_embeddings = time.perf_counter()
         print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
-        # Now, build the VectorStoreIndex using the nodes that now have pre-computed embeddings
-        print("Building VectorStoreIndex...")
         start_time_index_build = time.perf_counter()
-        # Add all nodes (root and leaf) to the document store
         docstore = SimpleDocumentStore()
         docstore.add_documents(self.nodes)
-        self.storage_context = StorageContext.from_defaults(docstore=docstore)
-        # When nodes already have embeddings, VectorStoreIndex will use them
         self.index = VectorStoreIndex(
-            self.leaf_nodes, # Pass leaf nodes which now contain their embeddings
             storage_context=self.storage_context,
-            embed_model=self.embed_model # Still pass the embed_model, though it won't re-embed if nodes have embeddings
         )
         end_time_index_build = time.perf_counter()
-        print(f"VectorStoreIndex built in {end_time_index_build - start_time_index_build:.2f} seconds.")
         print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
     def _setup_retriever(self) -> None:
         """Sets up the retriever."""

 from pathlib import Path
 from typing import List, Any, Union
 import asyncio # Import asyncio for concurrent operations
+import faiss
 from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
 from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
 from llama_index.readers.file import PyMuPDFReader
 from llama_index.llms.groq import Groq
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.vector_stores.faiss import FaissVectorStore
 class Pipeline:
         self.groq_api_key = groq_api_key
         self.pdf_path = Path(pdf_path)
         self.embed_model = embed_model
+        # The embedding dimension for 'all-MiniLM-L6-v2' is 384
+        self.d = 384
         # Configure Llama-Index LLM setting only
         Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
         end_time_embeddings = time.perf_counter()
         print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
+        # --- FAISS Integration ---
+        print("Building VectorStoreIndex with FAISS...")
         start_time_index_build = time.perf_counter()
+        # 1. Create a FAISS index
+        faiss_index = faiss.IndexFlatL2(self.d)
+        # 2. Create the FaissVectorStore instance
+        vector_store = FaissVectorStore(faiss_index=faiss_index)
+        # 3. Create the StorageContext, passing in our custom vector store
         docstore = SimpleDocumentStore()
         docstore.add_documents(self.nodes)
+        self.storage_context = StorageContext.from_defaults(
+            docstore=docstore,
+            vector_store=vector_store # Use the FAISS vector store
+        )
+        # 4. Build the index. LlamaIndex will now use FaissVectorStore internally.
         self.index = VectorStoreIndex(
+            self.leaf_nodes,
             storage_context=self.storage_context,
+            embed_model=self.embed_model
         )
         end_time_index_build = time.perf_counter()
+        print(f"VectorStoreIndex with FAISS built in {end_time_index_build - start_time_index_build:.2f} seconds.")
         print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
     def _setup_retriever(self) -> None:
         """Sets up the retriever."""