PercivalFletcher committed on
Commit
7210c17
·
verified ·
1 Parent(s): b0d13c1

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +22 -11
pipeline.py CHANGED
@@ -3,6 +3,7 @@ import time
3
  from pathlib import Path
4
  from typing import List, Any, Union
5
  import asyncio # Import asyncio for concurrent operations
 
6
 
7
  from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
8
  from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
@@ -11,6 +12,7 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
11
  from llama_index.readers.file import PyMuPDFReader
12
  from llama_index.llms.groq import Groq
13
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 
14
 
15
 
16
  class Pipeline:
@@ -33,7 +35,8 @@ class Pipeline:
33
  self.groq_api_key = groq_api_key
34
  self.pdf_path = Path(pdf_path)
35
  self.embed_model = embed_model
36
-
 
37
  # Configure Llama-Index LLM setting only
38
  Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
39
 
@@ -107,26 +110,34 @@ class Pipeline:
107
  end_time_embeddings = time.perf_counter()
108
  print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
109
 
110
- # Now, build the VectorStoreIndex using the nodes that now have pre-computed embeddings
111
- print("Building VectorStoreIndex...")
112
  start_time_index_build = time.perf_counter()
113
 
114
- # Add all nodes (root and leaf) to the document store
 
 
 
 
 
 
115
  docstore = SimpleDocumentStore()
116
  docstore.add_documents(self.nodes)
 
 
 
 
117
 
118
- self.storage_context = StorageContext.from_defaults(docstore=docstore)
119
-
120
- # When nodes already have embeddings, VectorStoreIndex will use them
121
  self.index = VectorStoreIndex(
122
- self.leaf_nodes, # Pass leaf nodes which now contain their embeddings
123
  storage_context=self.storage_context,
124
- embed_model=self.embed_model # Still pass the embed_model, though it won't re-embed if nodes have embeddings
125
  )
126
  end_time_index_build = time.perf_counter()
127
- print(f"VectorStoreIndex built in {end_time_index_build - start_time_index_build:.2f} seconds.")
128
  print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
129
-
130
 
131
  def _setup_retriever(self) -> None:
132
  """Sets up the retriever."""
 
3
  from pathlib import Path
4
  from typing import List, Any, Union
5
  import asyncio # Import asyncio for concurrent operations
6
+ import faiss
7
 
8
  from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
9
  from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
 
12
  from llama_index.readers.file import PyMuPDFReader
13
  from llama_index.llms.groq import Groq
14
  from llama_index.embeddings.huggingface import HuggingFaceEmbedding
15
+ from llama_index.vector_stores.faiss import FaissVectorStore
16
 
17
 
18
  class Pipeline:
 
35
  self.groq_api_key = groq_api_key
36
  self.pdf_path = Path(pdf_path)
37
  self.embed_model = embed_model
38
+ # The embedding dimension for 'all-MiniLM-L6-v2' is 384
39
+ self.d = 384
40
  # Configure Llama-Index LLM setting only
41
  Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
42
 
 
110
  end_time_embeddings = time.perf_counter()
111
  print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
112
 
113
+ # --- FAISS Integration ---
114
+ print("Building VectorStoreIndex with FAISS...")
115
  start_time_index_build = time.perf_counter()
116
 
117
+ # 1. Create a FAISS index
118
+ faiss_index = faiss.IndexFlatL2(self.d)
119
+
120
+ # 2. Create the FaissVectorStore instance
121
+ vector_store = FaissVectorStore(faiss_index=faiss_index)
122
+
123
+ # 3. Create the StorageContext, passing in our custom vector store
124
  docstore = SimpleDocumentStore()
125
  docstore.add_documents(self.nodes)
126
+ self.storage_context = StorageContext.from_defaults(
127
+ docstore=docstore,
128
+ vector_store=vector_store # Use the FAISS vector store
129
+ )
130
 
131
+ # 4. Build the index. LlamaIndex will now use FaissVectorStore internally.
 
 
132
  self.index = VectorStoreIndex(
133
+ self.leaf_nodes,
134
  storage_context=self.storage_context,
135
+ embed_model=self.embed_model
136
  )
137
  end_time_index_build = time.perf_counter()
138
+ print(f"VectorStoreIndex with FAISS built in {end_time_index_build - start_time_index_build:.2f} seconds.")
139
  print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
140
+
141
 
142
  def _setup_retriever(self) -> None:
143
  """Sets up the retriever."""