|
|
""" |
|
|
Build vector database from processed chunks |
|
|
Main pipeline for Step 3 |
|
|
""" |
|
|
|
|
|
import json |
|
|
import logging |
|
|
import time |
|
|
from pathlib import Path |
|
|
|
|
|
from .config import CHUNKS_OUTPUT_FILE, LOG_LEVEL, LOG_FORMAT, PINECONE_API_KEY |
|
|
from .embeddings import EmbeddingGenerator |
|
|
from .vector_db import LegalVectorDB |
|
|
|
|
|
|
|
|
# Pinecone support is optional: the backend is enabled only when its module
# can be imported AND an API key is configured. Otherwise the script uses the
# local vector database.
try:
    from .pinecone_vector_db import PineconeLegalVectorDB
except ImportError:
    PineconeLegalVectorDB = None
    USE_PINECONE = False
else:
    USE_PINECONE = bool(PINECONE_API_KEY)
|
|
|
|
|
|
|
|
# Configure logging from the project settings, then create the logger used
# throughout this module.
logging.basicConfig(format=LOG_FORMAT, level=LOG_LEVEL)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
def load_chunks(chunks_file: Path) -> list:
    """Load the processed chunks from a JSON file.

    The file must contain a JSON object with a top-level ``"chunks"`` key
    whose value is a list.

    Args:
        chunks_file: Location of the chunks JSON file. A ``Path`` is expected;
            a plain string or other ``os.PathLike`` is also accepted.

    Returns:
        The list stored under the ``"chunks"`` key.

    Raises:
        FileNotFoundError: If ``chunks_file`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If the top-level JSON value is not an object, or the
            ``"chunks"`` value is not a list.
        KeyError: If the JSON object has no ``"chunks"`` key.
    """
    chunks_file = Path(chunks_file)
    logger.info("Loading chunks from %s", chunks_file)

    if not chunks_file.exists():
        raise FileNotFoundError(f"Chunks file not found: {chunks_file}")

    with open(chunks_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Validate the document shape up front so a malformed file fails with a
    # message naming the file. The exception types match what plain
    # subscripting would raise (TypeError / KeyError).
    if not isinstance(data, dict):
        raise TypeError(
            f"Expected a JSON object in {chunks_file}, "
            f"got {type(data).__name__}"
        )
    if "chunks" not in data:
        raise KeyError(f"Missing 'chunks' key in {chunks_file}")

    chunks = data["chunks"]
    if not isinstance(chunks, list):
        raise TypeError(
            f"Expected 'chunks' in {chunks_file} to be a list, "
            f"got {type(chunks).__name__}"
        )

    logger.info("Loaded %d chunks", len(chunks))
    return chunks
|
|
|
|
|
|
|
|
def main() -> int:
    """Run the full pipeline that builds the vector database.

    Steps: load the processed chunks, initialise the embedding model, embed
    every chunk's ``'text'``, initialise the vector database (Pinecone when
    ``USE_PINECONE`` is true, otherwise the local ``LegalVectorDB``), and add
    the chunks together with their embeddings. Progress is printed to stdout
    and mirrored to the module logger.

    Returns:
        ``0`` when the build completes, ``1`` if any step raises an
        ``Exception`` (the error is logged with its traceback and printed).
    """
    # Status markers for console output. The garbled marker character in the
    # previous output strings is replaced by these.
    ok = "\u2713"      # check mark
    fail = "\u2717"    # ballot X
    rule = "=" * 80

    print(rule)
    print("Building Vector Database for Nepal Legal Documents")
    print(rule)

    logger.info(rule)
    logger.info("Starting Vector Database Build Pipeline")
    logger.info(rule)

    start_time = time.time()

    # Top-level boundary: any failure in the pipeline is reported and turned
    # into a non-zero return code instead of an uncaught traceback.
    try:
        print("\nStep 1: Loading processed chunks...")
        chunks = load_chunks(CHUNKS_OUTPUT_FILE)
        print(f"{ok} Loaded {len(chunks)} chunks")

        print("\nStep 2: Initializing embedding model...")
        logger.info("Initializing embedding model (this may take a moment on first run)...")
        embedder = EmbeddingGenerator()
        print(f"{ok} Model loaded: {embedder.model_name}")
        print(f"{ok} Embedding dimension: {embedder.embedding_dim}")

        print("\nStep 3: Generating embeddings for all chunks...")
        print("(This will take a minute or two...)")
        texts = [chunk['text'] for chunk in chunks]
        embeddings = embedder.generate_embeddings_batch(texts, show_progress=True)

        print(f"{ok} Generated {len(embeddings)} embeddings")
        print(f"{ok} Embedding shape: {embeddings.shape}")

        print("\nStep 4: Initializing vector database...")
        if USE_PINECONE:
            print("Using Pinecone cloud vector database...")
            vector_db = PineconeLegalVectorDB()
            print(f"{ok} Connected to Pinecone index: {vector_db.index_name}")
            database_summary = f"Database: Pinecone cloud index '{vector_db.index_name}'"
        else:
            print("Using local ChromaDB vector database...")
            vector_db = LegalVectorDB()
            print(f"{ok} Database initialized at: {vector_db.persist_directory}")
            database_summary = f"Database location: {vector_db.persist_directory}"

        print("\nStep 5: Adding chunks to vector database...")
        vector_db.add_chunks(chunks, embeddings.tolist())

        final_count = vector_db.get_count()
        print(f"{ok} Successfully indexed {final_count} chunks")

        elapsed_time = time.time() - start_time

        print("\n" + rule)
        print("VECTOR DATABASE BUILD COMPLETE!")
        print(rule)
        print(f"Total chunks indexed: {final_count}")
        print(f"Embedding dimension: {embedder.embedding_dim}")
        print(f"Embedding model: {embedder.model_name}")
        print(f"Build time: {elapsed_time:.2f} seconds")
        print(database_summary)
        print(rule)

        logger.info(rule)
        logger.info("Vector Database Build Complete!")
        logger.info("Total chunks indexed: %s", final_count)
        logger.info("Build time: %.2f seconds", elapsed_time)
        logger.info(rule)

        print(f"\n{ok} Vector database built successfully!")
        print(f"{ok} Ready for retrieval testing")
        print("\nNext step: Run 'python -m module_a.test_retrieval' to test queries")

        return 0

    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception("Build failed: %s", e)
        print(f"\n{fail} Build failed: {e}")
        return 1
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Use SystemExit rather than the `exit()` helper: `exit` is injected by
    # the `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S` or in frozen builds).
    raise SystemExit(main())
|
|
|