File size: 4,206 Bytes
de6fb09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from pathlib import Path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.groq import Groq
import chromadb
import os
from dotenv import load_dotenv
load_dotenv(dotenv_path="./.env.local")

# --- Global LlamaIndex defaults: Groq chat model + HuggingFace embedding model ---
# Model identifiers and the prompt are kept as named constants so the two
# Settings assignments below stay short and the values are easy to find.
_GROQ_MODEL_NAME = "meta-llama/llama-4-scout-17b-16e-instruct"
_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
_SYSTEM_PROMPT = "provide information according to context Do NOT guess or make assumptions please do not tell other that overlapping context. Respond briefly in one paragraph."

# os.getenv returns None when GROQ_API_KEY is unset; the key is expected to be
# supplied by the environment (load_dotenv above reads ./.env.local).
Settings.llm = Groq(
    model=_GROQ_MODEL_NAME,
    api_key=os.getenv("GROQ_API_KEY"),
    system_prompt=_SYSTEM_PROMPT,
)
Settings.embed_model = HuggingFaceEmbedding(model_name=_EMBEDDING_MODEL_NAME)


class VectorDBManager:
    """Wrap a persistent Chroma collection and the LlamaIndex index built on it.

    On construction the manager opens a ``chromadb.PersistentClient`` at
    ``db_path``, gets or creates ``collection_name`` and wires the collection
    into a ``ChromaVectorStore`` / ``StorageContext`` pair. ``self.index``
    starts as ``None``; call :meth:`build_index_from_documents` or
    :meth:`load_existing_index` before querying.
    """

    def __init__(self, db_path: str = "./chromafast_db", collection_name: str = "DB_collection"):
        """Open the Chroma store and prepare the vector/storage contexts.

        Args:
            db_path: Directory handed to ``chromadb.PersistentClient``.
            collection_name: Name of the collection to get or create.
        """
        self.db_path = db_path
        self.collection_name = collection_name

        # Persistent Chroma client (never ephemeral)
        self.db_client = chromadb.PersistentClient(path=db_path)
        self.collection = self.db_client.get_or_create_collection(collection_name)

        # Build vector + storage contexts
        self.vector_store = ChromaVectorStore(chroma_collection=self.collection)
        self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store)
        # Populated by build_index_from_documents() or load_existing_index().
        self.index = None

    def is_collection_empty(self) -> bool:
        """Return True when the Chroma collection reports zero stored records.

        Uses ``collection.count()`` so no record payloads are fetched just to
        decide emptiness. This stays best-effort: if the count cannot be
        obtained the error is printed and the collection is reported as empty.
        """
        try:
            return self.collection.count() == 0
        except Exception as exc:
            # Keep the original "treat failure as empty" contract, but make the
            # failure visible instead of swallowing it silently.
            print(f"Warning: could not count records in Chroma collection ({exc}); treating it as empty.")
            return True

    def build_index_from_documents(self, data_path: str) -> None:
        """Read every document under ``data_path`` and index it into Chroma.

        Args:
            data_path: Directory passed to ``SimpleDirectoryReader``.
        """
        print(f"📂 Loading documents from: {data_path}")
        documents = SimpleDirectoryReader(data_path).load_data()
        print(f"📄 Loaded {len(documents)} documents.")

        self.index = VectorStoreIndex.from_documents(
            documents,
            storage_context=self.storage_context,
        )
        print(f"✅ Index built and stored in Chroma at {self.db_path}")

    def load_existing_index(self) -> None:
        """Create ``self.index`` on top of the already-populated vector store."""
        print(f"📦 Loading existing Chroma DB from {self.db_path}")
        self.index = VectorStoreIndex.from_vector_store(self.vector_store)
        print("✅ Loaded existing index successfully")

    def get_query_engine(self):
        """Return a fresh query engine for the current index.

        The engine is created with ``use_async=True``.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        # Compare against None explicitly: "not yet initialised" is the only
        # state that should be rejected, regardless of the index's truthiness.
        if self.index is None:
            raise ValueError("❌ Index not initialized. Build or load it first.")
        return self.index.as_query_engine(use_async=True)

    def query(self, text: str):
        """Run a synchronous query against the built or loaded index.

        Args:
            text: The question to send to the query engine.

        Returns:
            Whatever the query engine's ``query`` call returns.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        return self.get_query_engine().query(text)

    async def aquery(self, text: str):
        """Run an asynchronous query and print how long it took.

        Args:
            text: The question to send to the query engine.

        Returns:
            Whatever the query engine's ``aquery`` coroutine returns.

        Raises:
            ValueError: If no index has been built or loaded yet.
        """
        import time

        # perf_counter is monotonic, so the reported duration cannot be skewed
        # by system clock adjustments during the request.
        started = time.perf_counter()
        query_engine = self.get_query_engine()
        response = await query_engine.aquery(text)
        print(f"🔍 Async query completed in {time.perf_counter() - started:.2f}s")
        return response


if __name__ == "__main__":
    # Demo entry point: build the index on first run, reuse it afterwards,
    # then run one sample query. Both paths are relative to the current
    # working directory.
    DATA_DIR = "../companyData1"
    DB_PATH = "../chromafast_db"

    manager = VectorDBManager(db_path=DB_PATH, collection_name="DB_collection")

    # The manager has already opened the store at DB_PATH by this point, and a
    # store with nothing on disk can only yield an empty collection, so the
    # collection contents alone decide between building and loading.
    if manager.is_collection_empty():
        print("🆕 No existing embeddings found. Building new Chroma DB...")
        manager.build_index_from_documents(DATA_DIR)
    else:
        print("📂 Existing Chroma DB found. Loading it...")
        manager.load_existing_index()

    # Test query
    question = "What are some of the main contributions of this new bitswits?"
    response = manager.query(question)
    print("\n🔍 Query Result:\n")
    print(response)