"""Test script to verify CSV integration into RAG system.

Runs three end-to-end checks and prints a pass/fail summary:

1. CSV document generation via ``CSVDocumentGenerator``.
2. Vector store ingestion and similarity search over CSV documents.
3. A unified vector store built from both PDF and CSV documents.

When executed as a script, the process exit status is 0 if every check
passed and 1 otherwise.
"""

import logging
import sys
import traceback
from pathlib import Path

# Add parent directory to path to allow importing src modules
sys.path.insert(0, str(Path(__file__).parent.parent))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Row count passed as ``sample_size`` to every CSV loader call below.
SAMPLE_SIZE = 1050000
# File name of the CSV data set used by all three checks.
CSV_FILENAME = "fraudTrain.csv"

_BANNER = "=" * 60


def _print_banner(title: str, leading_newline: bool = True) -> None:
    """Print *title* framed by two separator lines."""
    print(("\n" if leading_newline else "") + _BANNER)
    print(title)
    print(_BANNER)


def _report_error(exc: Exception) -> None:
    """Print *exc* and the traceback of the exception currently being handled."""
    print(f"\n❌ Error: {exc}")
    traceback.print_exc()


def test_csv_document_generation() -> bool:
    """Test CSV document generation.

    Returns:
        True when the generator ran without raising, False otherwise.
    """
    _print_banner("TEST 1: CSV Document Generation", leading_newline=False)

    try:
        # Imported lazily so an import failure is reported as a test failure.
        from src.rag.csv_document_generator import CSVDocumentGenerator

        # NOTE: relative path, so it is resolved against the current working
        # directory rather than against this file's location.
        csv_path = Path("data") / CSV_FILENAME
        generator = CSVDocumentGenerator(csv_path, sample_size=SAMPLE_SIZE)

        print("\n✓ Created CSVDocumentGenerator")
        print(f" CSV Path: {csv_path}")
        print(f" Sample Size: {SAMPLE_SIZE:,} rows")

        # Generate all documents
        documents = generator.generate_all_documents()
        print(f"\n✓ Generated {len(documents)} documents from CSV")

        # Show sample document
        if documents:
            sample = documents[0]
            print("\n--- Sample Document ---")
            print(f"Type: {sample.metadata.get('type', 'N/A')}")
            print(f"Source: {sample.metadata.get('source', 'N/A')}")
            print("\nContent Preview:")
            print(sample.page_content[:400])
            print("...")

        return True

    except Exception as e:  # top-level boundary of a check: report and fail
        _report_error(e)
        return False


def test_vector_store_integration() -> bool:
    """Test vector store integration with CSV documents.

    Returns:
        True when loading, indexing and searching all ran without raising,
        False otherwise.
    """
    _print_banner("TEST 2: Vector Store Integration")

    try:
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader()

        # Load CSV insights
        csv_path = settings.data_dir / CSV_FILENAME
        print(f"\n✓ Loading CSV insights from {csv_path}")
        csv_documents = document_loader.load_csv_insights(
            csv_path, sample_size=SAMPLE_SIZE
        )
        print(f"✓ Loaded {len(csv_documents)} CSV documents")

        # Create vector store and add documents
        print("\n✓ Creating vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(csv_documents)
        print(f"✓ Added {len(csv_documents)} documents to vector store")

        # Test similarity search
        print("\n✓ Testing similarity search...")
        query = "What are fraud patterns in grocery stores?"
        results = vector_store.similarity_search(query, k=3)

        print(f"\n✓ Found {len(results)} relevant documents for query:")
        print(f" '{query}'")

        for i, doc in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Type: {doc.metadata.get('type', 'N/A')}")
            print(f"Category: {doc.metadata.get('category', 'N/A')}")
            print(f"Content: {doc.page_content[:200]}...")

        return True

    except Exception as e:  # top-level boundary of a check: report and fail
        _report_error(e)
        return False


def test_full_rag_integration() -> bool:
    """Test full RAG integration with both PDF and CSV.

    A missing CSV file, or a PDF loader that yields nothing, is tolerated:
    the corresponding document list is simply empty. The check fails only
    when no documents at all could be loaded, or when any step raises.

    Returns:
        True when the unified store was built and queried without raising,
        False otherwise.
    """
    _print_banner("TEST 3: Full RAG Integration (PDF + CSV)")

    try:
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )

        all_documents = []

        # Load PDF documents
        print("\n✓ Loading PDF documents...")
        # ``or []`` keeps the later len() safe if the loader returns None.
        pdf_documents = (
            document_loader.load_pdfs_from_directory(settings.pdf_dir) or []
        )
        if pdf_documents:
            all_documents.extend(pdf_documents)
            print(f"✓ Loaded {len(pdf_documents)} PDF documents")

        # Load CSV insights
        print("\n✓ Loading CSV insights...")
        # Bound up front so the summary below works when the CSV is absent.
        csv_documents = []
        csv_path = settings.data_dir / CSV_FILENAME
        if csv_path.exists():
            csv_documents = (
                document_loader.load_csv_insights(
                    csv_path, sample_size=SAMPLE_SIZE
                )
                or []
            )
            all_documents.extend(csv_documents)
            print(f"✓ Loaded {len(csv_documents)} CSV documents")
        else:
            print(f"⚠️ CSV file not found at {csv_path}; skipping CSV documents")

        if not all_documents:
            # Nothing to index: fail with a clear message instead of handing
            # an empty batch to the vector store.
            print("\n❌ Error: no PDF or CSV documents were loaded")
            return False

        # Create vector store
        print("\n✓ Creating unified vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(all_documents)

        print(f"✓ Total documents in RAG: {len(all_documents)}")
        print(f" - PDF documents: {len(pdf_documents)}")
        print(f" - CSV documents: {len(csv_documents)}")

        # Test queries
        test_queries = [
            "What are common fraud patterns?",
            "Fraud rate in grocery transactions",
            "High risk merchants",
        ]

        print("\n✓ Testing queries with unified RAG...")
        for query in test_queries:
            results = vector_store.similarity_search(query, k=2)
            print(f"\nQuery: '{query}'")
            print(f" Found {len(results)} results")
            for doc in results:
                # Documents without a 'type' key are reported as PDFs.
                doc_type = doc.metadata.get('type', 'pdf')
                source = doc.metadata.get('source', 'N/A')
                print(f" - Source: {source} (Type: {doc_type})")

        return True

    except Exception as e:  # top-level boundary of a check: report and fail
        _report_error(e)
        return False


def main() -> int:
    """Run every check, print a summary, and return a process exit status.

    Returns:
        0 when all checks passed, 1 when at least one failed.
    """
    _print_banner("CSV RAG INTEGRATION VERIFICATION")

    # Run tests (list elements are evaluated in order, top to bottom)
    results = [
        ("CSV Document Generation", test_csv_document_generation()),
        ("Vector Store Integration", test_vector_store_integration()),
        ("Full RAG Integration", test_full_rag_integration()),
    ]

    # Summary
    _print_banner("TEST SUMMARY")

    for test_name, passed in results:
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"{status} - {test_name}")

    all_passed = all(passed for _, passed in results)

    if all_passed:
        print("\n🎉 All tests passed! CSV integration is working correctly.")
    else:
        print("\n⚠️ Some tests failed. Please check the errors above.")

    print(_BANNER)
    return 0 if all_passed else 1


if __name__ == "__main__":
    sys.exit(main())