Spaces:
Sleeping
Sleeping
Update pipeline.py
Browse files- pipeline.py +22 -11
pipeline.py
CHANGED
|
@@ -3,6 +3,7 @@ import time
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import List, Any, Union
|
| 5 |
import asyncio # Import asyncio for concurrent operations
|
|
|
|
| 6 |
|
| 7 |
from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
|
| 8 |
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
|
|
@@ -11,6 +12,7 @@ from llama_index.core.storage.docstore import SimpleDocumentStore
|
|
| 11 |
from llama_index.readers.file import PyMuPDFReader
|
| 12 |
from llama_index.llms.groq import Groq
|
| 13 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class Pipeline:
|
|
@@ -33,7 +35,8 @@ class Pipeline:
|
|
| 33 |
self.groq_api_key = groq_api_key
|
| 34 |
self.pdf_path = Path(pdf_path)
|
| 35 |
self.embed_model = embed_model
|
| 36 |
-
|
|
|
|
| 37 |
# Configure Llama-Index LLM setting only
|
| 38 |
Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
|
| 39 |
|
|
@@ -107,26 +110,34 @@ class Pipeline:
|
|
| 107 |
end_time_embeddings = time.perf_counter()
|
| 108 |
print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
|
| 109 |
|
| 110 |
-
#
|
| 111 |
-
print("Building VectorStoreIndex...")
|
| 112 |
start_time_index_build = time.perf_counter()
|
| 113 |
|
| 114 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
docstore = SimpleDocumentStore()
|
| 116 |
docstore.add_documents(self.nodes)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
# When nodes already have embeddings, VectorStoreIndex will use them
|
| 121 |
self.index = VectorStoreIndex(
|
| 122 |
-
self.leaf_nodes,
|
| 123 |
storage_context=self.storage_context,
|
| 124 |
-
embed_model=self.embed_model
|
| 125 |
)
|
| 126 |
end_time_index_build = time.perf_counter()
|
| 127 |
-
print(f"VectorStoreIndex built in {end_time_index_build - start_time_index_build:.2f} seconds.")
|
| 128 |
print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
|
| 129 |
-
|
| 130 |
|
| 131 |
def _setup_retriever(self) -> None:
|
| 132 |
"""Sets up the retriever."""
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
from typing import List, Any, Union
|
| 5 |
import asyncio # Import asyncio for concurrent operations
|
| 6 |
+
import faiss
|
| 7 |
|
| 8 |
from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
|
| 9 |
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes, get_root_nodes
|
|
|
|
| 12 |
from llama_index.readers.file import PyMuPDFReader
|
| 13 |
from llama_index.llms.groq import Groq
|
| 14 |
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
| 15 |
+
from llama_index.vector_stores.faiss import FaissVectorStore
|
| 16 |
|
| 17 |
|
| 18 |
class Pipeline:
|
|
|
|
| 35 |
self.groq_api_key = groq_api_key
|
| 36 |
self.pdf_path = Path(pdf_path)
|
| 37 |
self.embed_model = embed_model
|
| 38 |
+
# FAISS index dimension: must equal the output size of self.embed_model (384 for 'all-MiniLM-L6-v2')
|
| 39 |
+
self.d = 384
|
| 40 |
# Configure Llama-Index LLM setting only
|
| 41 |
Settings.llm = Groq(model="llama3-70b-8192", api_key=self.groq_api_key)
|
| 42 |
|
|
|
|
| 110 |
end_time_embeddings = time.perf_counter()
|
| 111 |
print(f"Embeddings generated for {len(self.leaf_nodes)} nodes in {end_time_embeddings - start_time_embeddings:.2f} seconds.")
|
| 112 |
|
| 113 |
+
# --- FAISS Integration ---
|
| 114 |
+
print("Building VectorStoreIndex with FAISS...")
|
| 115 |
start_time_index_build = time.perf_counter()
|
| 116 |
|
| 117 |
+
# 1. Create a flat (exact, brute-force) L2 FAISS index of dimension self.d
|
| 118 |
+
faiss_index = faiss.IndexFlatL2(self.d)
|
| 119 |
+
|
| 120 |
+
# 2. Create the FaissVectorStore instance
|
| 121 |
+
vector_store = FaissVectorStore(faiss_index=faiss_index)
|
| 122 |
+
|
| 123 |
+
# 3. Create the docstore and the StorageContext, passing in our custom vector store
|
| 124 |
docstore = SimpleDocumentStore()
|
| 125 |
docstore.add_documents(self.nodes)
|
| 126 |
+
self.storage_context = StorageContext.from_defaults(
|
| 127 |
+
docstore=docstore,
|
| 128 |
+
vector_store=vector_store # Use the FAISS vector store
|
| 129 |
+
)
|
| 130 |
|
| 131 |
+
# 4. Build the index. LlamaIndex will now use FaissVectorStore internally.
|
|
|
|
|
|
|
| 132 |
self.index = VectorStoreIndex(
|
| 133 |
+
self.leaf_nodes,
|
| 134 |
storage_context=self.storage_context,
|
| 135 |
+
embed_model=self.embed_model
|
| 136 |
)
|
| 137 |
end_time_index_build = time.perf_counter()
|
| 138 |
+
print(f"VectorStoreIndex with FAISS built in {end_time_index_build - start_time_index_build:.2f} seconds.")
|
| 139 |
print(f"Total index generation and embedding process completed in {end_time_index_build - start_time_embeddings:.2f} seconds.")
|
| 140 |
+
|
| 141 |
|
| 142 |
def _setup_retriever(self) -> None:
|
| 143 |
"""Sets up the retriever."""
|