Spaces:

ahmzakif
/

Fraud-Chatbot

Sleeping

App Files Files Community

Fraud-Chatbot / test /test_vector_store.py

ahmzakif

feat: add new project

fd99b61 verified about 1 month ago

raw

history blame contribute delete

6.95 kB

	"""Test script to verify CSV integration into RAG system."""

	import sys
	from pathlib import Path

	# Put the directory one level above this file's folder at the front of
	# sys.path so the `src.*` imports inside the test functions can resolve
	# (presumably that directory is the project root holding `src/`).
	sys.path.insert(0, str(Path(__file__).parent.parent))

	import logging

	# Configure root logging at INFO level and create this module's logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)


	def test_csv_document_generation():
	    """Run CSVDocumentGenerator over the fraud CSV and report the outcome.

	    Builds a ``CSVDocumentGenerator`` for ``data/fraudTrain.csv`` (a path
	    relative to the current working directory) with ``sample_size=1050000``,
	    calls ``generate_all_documents()``, and prints the document count plus a
	    preview of the first document (its ``type``/``source`` metadata and the
	    first 400 characters of ``page_content``).

	    Returns:
	        bool: True when generation finished without raising; False when any
	        exception occurred (the message and traceback are printed instead
	        of being propagated).
	    """
	    rule = "=" * 60
	    print(rule)
	    print("TEST 1: CSV Document Generation")
	    print(rule)

	    try:
	        # Imported lazily so an import failure is reported as a test failure.
	        from src.rag.csv_document_generator import CSVDocumentGenerator

	        source_csv = Path("data/fraudTrain.csv")
	        doc_generator = CSVDocumentGenerator(source_csv, sample_size=1050000)

	        print("\n✓ Created CSVDocumentGenerator")
	        print(f" CSV Path: {source_csv}")
	        print(" Sample Size: 1,050,000 rows")

	        # Generate all documents
	        generated = doc_generator.generate_all_documents()
	        print(f"\n✓ Generated {len(generated)} documents from CSV")

	        # Preview the first document only when something was produced.
	        if generated:
	            first = generated[0]
	            print("\n--- Sample Document ---")
	            print(f"Type: {first.metadata.get('type', 'N/A')}")
	            print(f"Source: {first.metadata.get('source', 'N/A')}")
	            print("\nContent Preview:")
	            print(first.page_content[:400])
	            print("...")
	    except Exception as exc:
	        import traceback

	        print(f"\n❌ Error: {exc!s}")
	        traceback.print_exc()
	        return False
	    else:
	        return True


	def test_vector_store_integration():
	    """Check that CSV-derived documents can be indexed and searched.

	    Loads CSV insights from ``settings.data_dir / "fraudTrain.csv"`` through
	    ``DocumentLoader.load_csv_insights`` (``sample_size=1050000``), adds them
	    to a new ``VectorStore``, then runs one similarity search with ``k=3``
	    and prints the type, category and first 200 content characters of each
	    result.

	    Returns:
	        bool: True when every step finished without raising; False when any
	        exception occurred (the message and traceback are printed instead
	        of being propagated).
	    """
	    rule = "=" * 60
	    print("\n" + rule)
	    print("TEST 2: Vector Store Integration")
	    print(rule)

	    try:
	        # Imported lazily so an import failure is reported as a test failure.
	        from src.rag.document_loader import DocumentLoader
	        from src.rag.vector_store import VectorStore
	        from src.config.config import settings

	        loader = DocumentLoader()

	        # Load CSV insights
	        csv_file = settings.data_dir / "fraudTrain.csv"
	        print(f"\n✓ Loading CSV insights from {csv_file}")

	        insight_docs = loader.load_csv_insights(csv_file, sample_size=1050000)
	        print(f"✓ Loaded {len(insight_docs)} CSV documents")

	        # Index the documents in a fresh vector store
	        print("\n✓ Creating vector store...")
	        store = VectorStore()
	        store.add_documents(insight_docs)
	        print(f"✓ Added {len(insight_docs)} documents to vector store")

	        # Run a single similarity query against the store
	        print("\n✓ Testing similarity search...")
	        question = "What are fraud patterns in grocery stores?"
	        hits = store.similarity_search(question, k=3)

	        print(f"\n✓ Found {len(hits)} relevant documents for query:")
	        print(f" '{question}'")

	        for rank, hit in enumerate(hits, start=1):
	            print(f"\n--- Result {rank} ---")
	            meta = hit.metadata
	            print(f"Type: {meta.get('type', 'N/A')}")
	            print(f"Category: {meta.get('category', 'N/A')}")
	            print(f"Content: {hit.page_content[:200]}...")
	    except Exception as exc:
	        import traceback

	        print(f"\n❌ Error: {exc!s}")
	        traceback.print_exc()
	        return False
	    else:
	        return True


	def test_full_rag_integration():
	    """Test full RAG integration with both PDF and CSV.

	    Loads PDF documents from ``settings.pdf_dir`` and, when the file exists,
	    CSV insights from ``settings.data_dir / "fraudTrain.csv"``. The combined
	    list is added to a new ``VectorStore`` and three sample similarity
	    searches (``k=2``) are run, printing the source and type of each hit.

	    A missing CSV file, or a falsy (``None``/empty) PDF result, is tolerated:
	    the corresponding count is reported as 0 rather than crashing the
	    summary with an unbound name or ``len(None)``.

	    Returns:
	        bool: True when every step finished without raising; False when any
	        exception occurred (the message and traceback are printed instead
	        of being propagated).
	    """
	    print("\n" + "=" * 60)
	    print("TEST 3: Full RAG Integration (PDF + CSV)")
	    print("=" * 60)

	    try:
	        # Imported lazily so an import failure is reported as a test failure.
	        from src.rag.document_loader import DocumentLoader
	        from src.rag.vector_store import VectorStore
	        from src.config.config import settings

	        document_loader = DocumentLoader(
	            chunk_size=settings.chunk_size,
	            chunk_overlap=settings.chunk_overlap,
	        )

	        all_documents = []

	        # Load PDF documents. A falsy result is normalised to an empty list
	        # so the summary below can always call len() on it.
	        print("\n✓ Loading PDF documents...")
	        pdf_documents = (
	            document_loader.load_pdfs_from_directory(settings.pdf_dir) or []
	        )
	        if pdf_documents:
	            all_documents.extend(pdf_documents)
	            print(f"✓ Loaded {len(pdf_documents)} PDF documents")

	        # Load CSV insights. Bind csv_documents up front: it is read in the
	        # summary even when the CSV file is absent and the branch is skipped.
	        print("\n✓ Loading CSV insights...")
	        csv_documents = []
	        csv_path = settings.data_dir / "fraudTrain.csv"
	        if csv_path.exists():
	            csv_documents = document_loader.load_csv_insights(
	                csv_path, sample_size=1050000
	            )
	            all_documents.extend(csv_documents)
	            print(f"✓ Loaded {len(csv_documents)} CSV documents")
	        else:
	            print(f"⚠️ CSV file not found at {csv_path}; skipping CSV insights")

	        # Create vector store
	        print("\n✓ Creating unified vector store...")
	        vector_store = VectorStore()
	        vector_store.add_documents(all_documents)
	        print(f"✓ Total documents in RAG: {len(all_documents)}")
	        print(f" - PDF documents: {len(pdf_documents)}")
	        print(f" - CSV documents: {len(csv_documents)}")

	        # Test queries
	        test_queries = [
	            "What are common fraud patterns?",
	            "Fraud rate in grocery transactions",
	            "High risk merchants",
	        ]

	        print("\n✓ Testing queries with unified RAG...")
	        for query in test_queries:
	            results = vector_store.similarity_search(query, k=2)
	            print(f"\nQuery: '{query}'")
	            print(f" Found {len(results)} results")
	            for doc in results:
	                # Documents without a 'type' entry are reported as PDFs.
	                doc_type = doc.metadata.get('type', 'pdf')
	                source = doc.metadata.get('source', 'N/A')
	                print(f" - Source: {source} (Type: {doc_type})")

	        return True

	    except Exception as e:
	        print(f"\n❌ Error: {str(e)}")
	        import traceback
	        traceback.print_exc()
	        return False


	if __name__ == "__main__":
	    # Script entry point: run the three checks in order, print a summary,
	    # and exit non-zero when any check failed so shells and CI notice it.
	    print("\n" + "=" * 60)
	    print("CSV RAG INTEGRATION VERIFICATION")
	    print("=" * 60)

	    results = []

	    # Run tests
	    results.append(("CSV Document Generation", test_csv_document_generation()))
	    results.append(("Vector Store Integration", test_vector_store_integration()))
	    results.append(("Full RAG Integration", test_full_rag_integration()))

	    # Summary
	    print("\n" + "=" * 60)
	    print("TEST SUMMARY")
	    print("=" * 60)

	    for test_name, passed in results:
	        status = "✅ PASSED" if passed else "❌ FAILED"
	        print(f"{status} - {test_name}")

	    all_passed = all(passed for _, passed in results)

	    if all_passed:
	        print("\n🎉 All tests passed! CSV integration is working correctly.")
	    else:
	        print("\n⚠️ Some tests failed. Please check the errors above.")

	    print("=" * 60)

	    # Propagate the outcome through the process exit status (0 = success).
	    sys.exit(0 if all_passed else 1)