Multimodel_Rag / scripts /test_day5.py
Dhrumil Parikh
deploy GeminiRAG
cdc55f4
Raw
History Blame Contribute Delete
5.75 kB
"""
Day 5 verification: embed + ChromaDB add/search end-to-end.
Run: python scripts/test_day5.py
"""
import json
import os
import sys
import uuid
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables from the ".env" file located in the parent of the
# directory that contains this script. This runs before the `app.*` imports
# below, so statement order here matters.
load_dotenv(Path(__file__).parent.parent / ".env")
# Put that same parent directory first on sys.path; the `app.*` imports below
# presumably rely on this to resolve when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
from sqlmodel import Session, create_engine
from app.models.db import Job, JobStatus, User, UserRole, UsageLog
from app.observability.logging import configure_logging
# Configure logging at import time, before any test function runs.
configure_logging()
# Indexing os.environ raises KeyError when DATABASE_URL is unset, so the
# script stops here rather than failing later inside a test.
DATABASE_URL = os.environ["DATABASE_URL"]
# Module-level engine used by the tests below to open database sessions.
engine = create_engine(DATABASE_URL, echo=False)
def get_or_create_test_user(db):
    """Return the fixed Day 5 test user, inserting it when it is missing.

    The user is looked up by the email ``day5test@test.com``. If no row is
    found, a new active user with role ``UserRole.user`` and a hashed
    password is added, committed and refreshed before being returned.

    Args:
        db: Open database session used for the lookup and the insert.

    Returns:
        The existing or newly created ``User`` row.
    """
    from sqlmodel import select

    lookup = select(User).where(User.email == "day5test@test.com")
    existing = db.exec(lookup).first()
    if existing:
        return existing

    # Only needed on the creation path, so it is imported lazily here.
    from app.security import hash_password

    created = User(
        email="day5test@test.com",
        hashed_password=hash_password("test123"),
        role=UserRole.user,
        is_active=True,
    )
    db.add(created)
    db.commit()
    # Reload the instance from the session after the commit.
    db.refresh(created)
    return created
def test_embed_query():
    """Embed one sample question and check the vector length.

    Returns:
        The embedding produced by ``embed_query`` for the sample question.

    Raises:
        AssertionError: If the embedding does not have 768 entries.
    """
    print("\n=== Test: embed_query ===")
    from app.config import settings
    from app.rag.embedder import embed_query

    question = "What is machine learning?"
    vec = embed_query(question, settings)

    dim = len(vec)
    assert dim == 768, f"Expected 768-dim vector, got {dim}"

    # Show only the first three components, rounded, as a quick sanity peek.
    preview = [round(component, 4) for component in vec[:3]]
    print(f"[PASS] embed_query: dim={dim}, first3={preview}")
    return vec
def test_embed_chunks_and_chromadb(query_vec):
    """Chunk sample text, embed it, store it in ChromaDB and search it.

    Steps, all inside one database session:
      1. Fetch or create the test user and insert a ``Job`` row with a fresh
         UUID.
      2. Split a two-page sample text with ``chunk_text``.
      3. Embed the chunks with ``embed_chunks`` and check count and size.
      4. Check that at least one ``UsageLog`` row exists for the job.
      5. Add the chunks to the ChromaDB collection and search it, restricted
         to this job's id.

    The job row and the stored chunks are left in place afterwards.

    Args:
        query_vec: Query embedding passed to ``search``; ``main`` supplies
            the vector returned by ``test_embed_query``.

    Returns:
        The number of chunks produced from the sample text.

    Raises:
        AssertionError: If no chunks, usage logs or search results are
            produced, or if the embeddings have the wrong count or size.
    """
    print("\n=== Test: embed_chunks + ChromaDB ===")
    # Function-scope import, matching the rest of this script; needed for the
    # replacement of the deprecated datetime.utcnow() below.
    from datetime import timezone

    from app.config import settings
    from app.rag.embedder import embed_chunks
    from app.rag.vectorstore import (
        get_chroma_client, get_or_create_collection,
        add_chunks, search, delete_job_chunks
    )

    with Session(engine) as db:
        user = get_or_create_test_user(db)
        job_id = str(uuid.uuid4())
        fake_job_id = uuid.UUID(job_id)

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware
        # UTC "now" and drop tzinfo so the stored value is still the naive
        # UTC timestamp utcnow() produced. A single reading is shared so
        # created_at and updated_at are equal on a brand-new row.
        now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

        # Create a fake job record for log_llm_call
        # NOTE(review): presumably embed_chunks logs usage against this row;
        # confirm in app.rag.embedder.
        job = Job(
            id=fake_job_id,
            user_id=user.id,
            filename="test_rag.pdf",
            file_type="pdf",
            file_path="C:/tmp/fake.pdf",
            file_size_bytes=1000,
            status=JobStatus.processing,
            created_at=now_utc,
            updated_at=now_utc,
        )
        db.add(job)
        db.commit()

        # Chunker
        from app.rag.chunker import chunk_text
        text = "[Page 1]\n" + "Machine learning is a subset of AI that enables computers to learn. " * 20
        text += "\n[Page 2]\n" + "Deep learning uses neural networks with many layers. " * 20
        chunks = chunk_text(text, job_id, "test_rag.pdf", "pdf",
                            chunk_size=100, overlap=20)
        print(f"chunks produced: {len(chunks)}")
        assert len(chunks) > 0, "No chunks produced!"

        # Embedder
        embeddings = embed_chunks(chunks, user.id, fake_job_id, settings, db)
        assert len(embeddings) == len(chunks), "Embedding count mismatch"
        assert len(embeddings[0]) == 768, f"Wrong embedding dim: {len(embeddings[0])}"
        print(f"[PASS] embed_chunks: {len(embeddings)} vectors, dim={len(embeddings[0])}")

        # Check usage_logs
        from sqlmodel import select
        logs = db.exec(select(UsageLog).where(UsageLog.job_id == fake_job_id)).all()
        print(f"usage_logs for embed job: {len(logs)}")
        assert len(logs) > 0, "No usage logs for embed_chunks!"
        for lg in logs:
            print(f" endpoint={lg.endpoint} tokens={lg.prompt_tokens} latency={lg.latency_ms}ms")

        # ChromaDB
        client = get_chroma_client(settings)
        collection = get_or_create_collection(client, settings)
        delete_job_chunks(collection, job_id)  # clean before add
        add_chunks(collection, chunks, embeddings)
        print(f"[PASS] add_chunks: {len(chunks)} chunks upserted")

        # Search, limited to the chunks that belong to this job.
        results = search(collection, query_vec, top_k=3, job_ids=[job_id])
        print(f"search results: {len(results)}")
        assert len(results) > 0, "No search results!"
        for r in results:
            print(f" score={round(r['score'],4)} page={r['page_or_segment']} text={r['text'][:60]}")
        print("[PASS] ChromaDB search returning ranked results")
        return len(chunks)
def test_full_pipeline():
    """Open the ChromaDB collection and report how many documents it holds.

    The chunker, embedder, vector-store and ``chromadb`` imports are not used
    by the body; they are kept, in their original order, so that a broken
    import still fails this check. The document count is printed but not
    asserted.
    """
    print("\n=== Test: full pipeline via test PDF processor ===")
    from app.config import settings
    from app.rag.chunker import chunk_text  # noqa: F401
    from app.rag.embedder import embed_chunks  # noqa: F401
    from app.rag.vectorstore import (
        get_chroma_client, get_or_create_collection, add_chunks  # noqa: F401
    )
    import chromadb  # noqa: F401

    # Open the configured collection and report its size.
    collection = get_or_create_collection(get_chroma_client(settings), settings)
    doc_total = collection.count()
    print(f"[INFO] ChromaDB collection '{settings.CHROMA_COLLECTION}' has {doc_total} documents")
    print("[PASS] ChromaDB collection accessible")
def main():
    """Run the Day 5 checks in order and print a summary.

    Any exception raised by a check is reported with its traceback and the
    process exits with status 1.
    """
    try:
        query_vec = test_embed_query()
        chunk_count = test_embed_chunks_and_chromadb(query_vec)
        test_full_pipeline()

        rule = "=" * 60
        print("\n" + rule)
        print("DAY 5 VERIFICATION SUMMARY")
        print(rule)
        print("[PASS] embed_query: 768-dim vector")
        print(f"[PASS] chunk_text + embed_chunks: {chunk_count} chunks with real embeddings")
        print("[PASS] ChromaDB: add + search + usage_logs populated")
        print("[PASS] Full pipeline wiring verified")
    except Exception as exc:
        # Top-level boundary of the script: report the failure and signal it
        # to the caller through a non-zero exit status.
        import traceback

        print(f"\n[FAIL] {exc}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()