Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Settings | |
| from llama_index.vector_stores.chroma import ChromaVectorStore | |
| from llama_index.core.storage.storage_context import StorageContext | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.llms.groq import Groq | |
| import chromadb | |
| import os | |
| from dotenv import load_dotenv | |
# Read environment variables (GROQ_API_KEY is looked up below) from the dotenv file.
load_dotenv(dotenv_path="./.env.local")

# --- Global LlamaIndex defaults: Groq chat model + HuggingFace embedding model ---
Settings.llm = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
    model="meta-llama/llama-4-scout-17b-16e-instruct",
    system_prompt="provide information according to context Do NOT guess or make assumptions please do not tell other that overlapping context. Respond briefly in one paragraph.",
)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
class VectorDBManager:
    """Wrap a persistent Chroma collection and the LlamaIndex index built on it.

    The constructor opens (or creates) the collection and prepares the vector
    store and storage context. ``self.index`` stays ``None`` until either
    ``build_index_from_documents`` or ``load_existing_index`` is called.
    """

    def __init__(self, db_path: str = "./chromafast_db", collection_name: str = "DB_collection"):
        """Open the Chroma collection and prepare the storage objects.

        Args:
            db_path: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Name of the collection to get or create.
        """
        self.db_path = db_path
        self.collection_name = collection_name

        # Persistent Chroma client (never ephemeral)
        self.db_client = chromadb.PersistentClient(path=db_path)
        self.collection = self.db_client.get_or_create_collection(collection_name)

        # Build vector + storage contexts
        self.vector_store = ChromaVectorStore(chroma_collection=self.collection)
        self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)

        # Populated by build_index_from_documents() or load_existing_index().
        self.index = None

    def is_collection_empty(self) -> bool:
        """Return True when the Chroma collection holds no records.

        Only the record count is requested; the stored records themselves are
        not fetched. This is a best-effort check: if the collection cannot be
        read, the error is printed and the collection is reported as empty.
        """
        try:
            return self.collection.count() == 0
        except Exception as exc:
            # Deliberate best effort: report the failure instead of hiding it,
            # but keep answering "empty" so callers are not crashed by it.
            print(f"Could not count Chroma collection ({exc}); treating it as empty.")
            return True

    def build_index_from_documents(self, data_path: str) -> None:
        """Read the documents under ``data_path`` and index them into Chroma.

        Args:
            data_path: Directory passed to ``SimpleDirectoryReader``.
        """
        print(f"π Loading documents from: {data_path}")
        documents = SimpleDirectoryReader(data_path).load_data()
        print(f"π Loaded {len(documents)} documents.")

        # The storage context routes the index into the Chroma vector store.
        self.index = VectorStoreIndex.from_documents(
            documents,
            storage_context=self.storage_context,
        )
        print(f"β Index built and stored in Chroma at {self.db_path}")

    def load_existing_index(self) -> None:
        """Create ``self.index`` on top of the existing Chroma vector store."""
        print(f"π¦ Loading existing Chroma DB from {self.db_path}")
        self.index = VectorStoreIndex.from_vector_store(self.vector_store)
        print("β Loaded existing index successfully")

    def get_query_engine(self):
        """Return a new query engine for the current index.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        # Explicit None check: only "not yet initialised" should be rejected.
        if self.index is None:
            raise ValueError("β Index not initialized. Build or load it first.")
        return self.index.as_query_engine(use_async=True)

    def query(self, text: str):
        """Run ``text`` synchronously against the index and return the response.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        return self.get_query_engine().query(text)

    async def aquery(self, text: str):
        """Run ``text`` asynchronously, print the elapsed time, return the response.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        import time

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        query_engine = self.get_query_engine()
        response = await query_engine.aquery(text)
        print(f"π Async query completed in {time.perf_counter() - started:.2f}s")
        return response
if __name__ == "__main__":
    # Demo run: make sure an index exists, then ask it one sample question.
    DATA_DIR = "../companyData1"
    DB_PATH = "../chromafast_db"

    manager = VectorDBManager(db_path=DB_PATH, collection_name="DB_collection")

    # Reuse stored embeddings only when the DB path exists and the collection
    # is non-empty; otherwise index the documents from DATA_DIR.
    has_embeddings = os.path.exists(DB_PATH) and not manager.is_collection_empty()
    if has_embeddings:
        print("π Existing Chroma DB found. Loading it...")
        manager.load_existing_index()
    else:
        print("π No existing embeddings found. Building new Chroma DB...")
        manager.build_index_from_documents(DATA_DIR)

    # Test query
    question = "What are some of the main contributions of this new bitswits?"
    response = manager.query(question)
    print("\nπ Query Result:\n")
    print(response)