# RAG-Based-Product-Inquiry-ChatBot/test_scripts/test_vector_db_population.py
# Yoma
# Initial HF Spaces deployment without chroma_db
# 625e9e8
import logging
import os
import shutil
import sys
import time

import chromadb
from chromadb.config import Settings
# Add the root directory to the Python path to allow for absolute imports
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from vector_db_manager import run_etl_pipeline
from document_processor import prepare_product_documents, prepare_review_documents
# Configure logging to display INFO-level (and above) messages, each prefixed
# with a timestamp, the logger name, and the level
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# --- Test Configuration ---
# Use a lightweight model for fast testing
TEST_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
# Directory for the throwaway test database. The path is relative to the
# current working directory (it is not a tempfile-managed location); the test
# removes it before running and again during cleanup.
TEST_DB_PATH = "./chroma_db_test"
def _remove_test_db(path, attempts=5, delay_seconds=2):
    """Delete the directory tree at *path*, retrying when the OS refuses.

    Retries exist to ride out file-lock race conditions on Windows, where
    handles may be released slightly after the owning client is shut down.

    Args:
        path: Directory to delete.
        attempts: Maximum number of ``shutil.rmtree`` attempts.
        delay_seconds: Pause between two consecutive attempts.

    Returns:
        True if the directory was removed, False if every attempt failed.
    """
    for attempt in range(1, attempts + 1):
        try:
            shutil.rmtree(path)
        except OSError as exc:  # PermissionError is a subclass of OSError
            if attempt < attempts:
                logger.warning(
                    f"Cleanup attempt {attempt} failed: {exc}. "
                    f"Retrying in {delay_seconds} seconds..."
                )
                time.sleep(delay_seconds)
            else:
                # Last attempt: no retry follows, so neither promise one nor sleep.
                logger.warning(f"Cleanup attempt {attempt} failed: {exc}.")
        else:
            logger.info(f"Successfully removed test database at: {path}")
            return True

    logger.error("Failed to remove test database directory after multiple attempts. It may need to be removed manually.")
    return False


def test_vector_db_population():
    """
    Tests the full ETL pipeline: chunking, embedding, and populating ChromaDB.
    It verifies that collections are created and populated correctly.

    Steps:
        1. Run ``run_etl_pipeline`` against ``products.json`` and
           ``product_reviews.json`` (located one directory above this file),
           writing into ``TEST_DB_PATH`` with ``TEST_MODEL``.
        2. Open the resulting database and check that the 'products' and
           'reviews' collections hold as many entries as the document
           preparation functions produce, and that a sample query on each
           returns at least one hit.
        3. Shut down both clients and delete the test database directory,
           whether or not verification succeeded.

    Raises:
        AssertionError: If the database directory is missing, a collection
            count does not match, or a sample query returns nothing.
    """
    logger.info("--- Starting Test: Vector DB Population ---")

    # Define paths to the data files
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    products_file = os.path.join(base_dir, 'products.json')
    reviews_file = os.path.join(base_dir, 'product_reviews.json')

    # Bound up front so the cleanup in `finally` can tell what was created.
    db_manager = None
    client = None

    # Clean up any previous test database directory
    if os.path.exists(TEST_DB_PATH):
        logger.warning(f"Removing existing test database at: {TEST_DB_PATH}")
        shutil.rmtree(TEST_DB_PATH)

    try:
        # --- 1. Run the ETL Pipeline ---
        logger.info(f"Running ETL pipeline with model '{TEST_MODEL}' into '{TEST_DB_PATH}'")
        db_manager = run_etl_pipeline(
            products_file=products_file,
            reviews_file=reviews_file,
            db_path=TEST_DB_PATH,
            model_name=TEST_MODEL
        )
        logger.info("ETL Pipeline finished.")

        # --- 2. Verification Step ---
        logger.info("\n--- Verifying Database Content ---")
        assert os.path.exists(TEST_DB_PATH), "Database directory was not created."

        # allow_reset=True is needed for the client.reset() call made during cleanup.
        client = chromadb.PersistentClient(
            path=TEST_DB_PATH,
            settings=Settings(allow_reset=True)
        )

        # --- Verify Products Collection ---
        logger.info("[Verifying 'products' collection...]")
        products_collection = client.get_collection(name="products")
        assert products_collection is not None, "'products' collection not found."

        # Get expected count from the source
        expected_products_count = len(prepare_product_documents(products_file))
        actual_products_count = products_collection.count()
        assert actual_products_count == expected_products_count, \
            f"Expected {expected_products_count} products, but found {actual_products_count} in DB."
        logger.info(f"SUCCESS: 'products' collection count is correct ({actual_products_count}).")

        # Perform a sample query
        query_result = products_collection.query(query_texts=["lightweight laptop"], n_results=1)
        assert query_result['ids'][0], "Sample query on 'products' collection returned no results."
        logger.info(f"SUCCESS: Sample query on 'products' returned: {query_result['ids'][0][0]}")

        # --- Verify Reviews Collection ---
        logger.info("\n[Verifying 'reviews' collection...]")
        reviews_collection = client.get_collection(name="reviews")
        assert reviews_collection is not None, "'reviews' collection not found."

        # Get expected count from the source
        expected_reviews_count = len(prepare_review_documents(reviews_file, products_file))
        actual_reviews_count = reviews_collection.count()
        assert actual_reviews_count == expected_reviews_count, \
            f"Expected {expected_reviews_count} reviews, but found {actual_reviews_count} in DB."
        logger.info(f"SUCCESS: 'reviews' collection count is correct ({actual_reviews_count}).")

        # Perform a sample query
        query_result = reviews_collection.query(query_texts=["disappointed with battery"], n_results=1)
        assert query_result['ids'][0], "Sample query on 'reviews' collection returned no results."
        logger.info(f"SUCCESS: Sample query on 'reviews' returned: {query_result['ids'][0][0]}")
    finally:
        # --- 3. Cleanup Step ---
        # Runs on success and on failure alike, so the clients are shut down
        # exactly once here rather than also at the end of the `try` body.
        logger.info("\n--- Cleaning up test database ---")

        # Explicitly shut down both client instances to release file locks
        if db_manager:
            logger.info("Shutting down ETL client...")
            db_manager.shutdown()
        if client:
            logger.info("Shutting down verification client...")
            client.reset()

        # Retry removal to handle Windows file lock race conditions
        if os.path.exists(TEST_DB_PATH):
            _remove_test_db(TEST_DB_PATH)

    # Only reached when no assertion or exception propagated out of the test.
    logger.info("\n--- Test Finished Successfully ---")
# Allow running this test directly as a script, without a test runner.
if __name__ == "__main__":
    test_vector_db_population()