Spaces:

dhrumilparikh
/

Multimodel_Rag

Sleeping

Dhrumil Parikh

deploy GeminiRAG

cdc55f4 about 1 month ago

5.75 kB

	"""
	Day 5 verification: embed + ChromaDB add/search end-to-end.
	Run: python scripts/test_day5.py
	"""
	import json
	import os
	import sys
	import uuid
	from datetime import datetime
	from pathlib import Path

	# Load the `.env` file that sits in the parent of this script's directory.
	# Done before anything else so that later reads of os.environ (and any
	# project module imported below) can see the values it provides.
	from dotenv import load_dotenv
	load_dotenv(Path(__file__).parent.parent / ".env")

	# Put the parent of this script's directory on the import path so the
	# `app.*` imports below resolve (assumes the `app` package lives there —
	# TODO confirm against the repository layout).
	sys.path.insert(0, str(Path(__file__).parent.parent))

	from sqlmodel import Session, create_engine

	from app.models.db import Job, JobStatus, User, UserRole, UsageLog
	from app.observability.logging import configure_logging

	# Project logging setup; runs once at import time of this script.
	configure_logging()

	# Raises KeyError at import time if DATABASE_URL is not set.
	DATABASE_URL = os.environ["DATABASE_URL"]
	# Shared engine used by the Session opened in the tests below.
	engine = create_engine(DATABASE_URL, echo=False)


	def get_or_create_test_user(db):
	    """Return the ``day5test@test.com`` user, inserting it first if absent.

	    Args:
	        db: Open session used to look the user up and, when it does not
	            exist yet, to add, commit and refresh the new row.

	    Returns:
	        The existing or newly created ``User`` instance.
	    """
	    from sqlmodel import select

	    test_email = "day5test@test.com"
	    existing = db.exec(select(User).where(User.email == test_email)).first()
	    if existing:
	        return existing

	    # Only needed on the creation path, so imported lazily here.
	    from app.security import hash_password

	    created = User(
	        email=test_email,
	        hashed_password=hash_password("test123"),
	        role=UserRole.user,
	        is_active=True,
	    )
	    db.add(created)
	    db.commit()
	    db.refresh(created)
	    return created


	def test_embed_query():
	    """Embed one sample question and check the vector has 768 entries.

	    Returns:
	        The vector returned by ``embed_query`` for the sample question.

	    Raises:
	        AssertionError: If the vector length is not 768.
	    """
	    print("\n=== Test: embed_query ===")
	    from app.config import settings
	    from app.rag.embedder import embed_query

	    query_vector = embed_query("What is machine learning?", settings)
	    dim = len(query_vector)
	    assert dim == 768, f"Expected 768-dim vector, got {dim}"

	    # Show only the first three components, rounded, as a quick sanity peek.
	    preview = [round(component, 4) for component in query_vector[:3]]
	    print(f"[PASS] embed_query: dim={dim}, first3={preview}")
	    return query_vector


	def test_embed_chunks_and_chromadb(query_vec):
	    """Chunk sample text, embed it, store it in ChromaDB and search it back.

	    Steps, in order:
	      1. Insert a throwaway ``Job`` row owned by the test user.
	      2. Split a two-page synthetic text with ``chunk_text``.
	      3. Embed the chunks with ``embed_chunks``; check count and dimension.
	      4. Check that at least one ``UsageLog`` row exists for the job.
	      5. Add the chunks to the Chroma collection and run ``search`` with
	         ``query_vec``, restricted to this job's id.

	    Args:
	        query_vec: Query embedding handed to ``search`` (``main`` passes the
	            vector returned by ``test_embed_query``).

	    Returns:
	        Number of chunks produced by the chunker.

	    Raises:
	        AssertionError: If any of the verification steps fails.
	    """
	    print("\n=== Test: embed_chunks + ChromaDB ===")
	    # Local import: only this function needs `timezone`.
	    from datetime import timezone

	    from app.config import settings
	    from app.rag.embedder import embed_chunks
	    from app.rag.vectorstore import (
	        get_chroma_client, get_or_create_collection,
	        add_chunks, search, delete_job_chunks
	    )

	    with Session(engine) as db:
	        user = get_or_create_test_user(db)

	        # One fresh id, kept both as a UUID (DB rows) and as a string
	        # (chunker / vector store arguments).
	        fake_job_id = uuid.uuid4()
	        job_id = str(fake_job_id)

	        # datetime.utcnow() is deprecated since Python 3.12. This expression
	        # yields the same naive-UTC value, and reading the clock once keeps
	        # created_at and updated_at identical for the new row.
	        now = datetime.now(timezone.utc).replace(tzinfo=None)

	        # Create a fake job record for log_llm_call
	        job = Job(
	            id=fake_job_id,
	            user_id=user.id,
	            filename="test_rag.pdf",
	            file_type="pdf",
	            file_path="C:/tmp/fake.pdf",
	            file_size_bytes=1000,
	            status=JobStatus.processing,
	            created_at=now,
	            updated_at=now,
	        )
	        db.add(job)
	        db.commit()

	        # Chunker
	        from app.rag.chunker import chunk_text
	        text = "[Page 1]\n" + "Machine learning is a subset of AI that enables computers to learn. " * 20
	        text += "\n[Page 2]\n" + "Deep learning uses neural networks with many layers. " * 20
	        chunks = chunk_text(text, job_id, "test_rag.pdf", "pdf",
	                            chunk_size=100, overlap=20)
	        print(f"chunks produced: {len(chunks)}")
	        assert len(chunks) > 0, "No chunks produced!"

	        # Embedder
	        embeddings = embed_chunks(chunks, user.id, fake_job_id, settings, db)
	        assert len(embeddings) == len(chunks), "Embedding count mismatch"
	        assert len(embeddings[0]) == 768, f"Wrong embedding dim: {len(embeddings[0])}"
	        print(f"[PASS] embed_chunks: {len(embeddings)} vectors, dim={len(embeddings[0])}")

	        # Check usage_logs
	        from sqlmodel import select
	        logs = db.exec(select(UsageLog).where(UsageLog.job_id == fake_job_id)).all()
	        print(f"usage_logs for embed job: {len(logs)}")
	        assert len(logs) > 0, "No usage logs for embed_chunks!"
	        for lg in logs:
	            print(f" endpoint={lg.endpoint} tokens={lg.prompt_tokens} latency={lg.latency_ms}ms")

	        # ChromaDB
	        client = get_chroma_client(settings)
	        collection = get_or_create_collection(client, settings)

	        # The job id was generated above, so nothing should exist for it yet;
	        # the delete is kept as a guard before adding.
	        delete_job_chunks(collection, job_id)
	        add_chunks(collection, chunks, embeddings)
	        print(f"[PASS] add_chunks: {len(chunks)} chunks upserted")

	        # Search, limited to the chunks of this job
	        results = search(collection, query_vec, top_k=3, job_ids=[job_id])
	        print(f"search results: {len(results)}")
	        assert len(results) > 0, "No search results!"
	        for r in results:
	            print(f" score={round(r['score'],4)} page={r['page_or_segment']} text={r['text'][:60]}")

	        print("[PASS] ChromaDB search returning ranked results")
	        return len(chunks)


	def test_full_pipeline():
	    """Open the configured Chroma collection and report its document count.

	    The count is printed as an informational line; no assertion is made on
	    it, so the function passes whenever the collection can be reached.
	    """
	    print("\n=== Test: full pipeline via test PDF processor ===")
	    from app.config import settings
	    # NOTE(review): chunk_text, embed_chunks, add_chunks and chromadb are not
	    # referenced below — presumably imported only to confirm they load.
	    from app.rag.chunker import chunk_text
	    from app.rag.embedder import embed_chunks
	    from app.rag.vectorstore import (
	        get_chroma_client,
	        get_or_create_collection,
	        add_chunks,
	    )
	    import chromadb

	    # Reach the collection and ask how many documents it currently holds.
	    chroma = get_chroma_client(settings)
	    docs = get_or_create_collection(chroma, settings)
	    total = docs.count()
	    collection_name = settings.CHROMA_COLLECTION
	    print(f"[INFO] ChromaDB collection '{collection_name}' has {total} documents")
	    print("[PASS] ChromaDB collection accessible")


	def main():
	    """Run the three checks in order and print a summary.

	    If any step raises, the error and its traceback are printed and the
	    process exits with status 1.
	    """
	    try:
	        query_vec = test_embed_query()
	        chunk_count = test_embed_chunks_and_chromadb(query_vec)
	        test_full_pipeline()

	        rule = "=" * 60
	        report = [
	            "\n" + rule,
	            "DAY 5 VERIFICATION SUMMARY",
	            rule,
	            "[PASS] embed_query: 768-dim vector",
	            f"[PASS] chunk_text + embed_chunks: {chunk_count} chunks with real embeddings",
	            "[PASS] ChromaDB: add + search + usage_logs populated",
	            "[PASS] Full pipeline wiring verified",
	        ]
	        for line in report:
	            print(line)
	    except Exception as exc:
	        import traceback

	        print(f"\n[FAIL] {exc}")
	        traceback.print_exc()
	        sys.exit(1)


	# Run the checks only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()