Spaces:
Sleeping
Sleeping
| """Test script to verify CSV integration into RAG system.""" | |
import sys
from pathlib import Path

# Put the directory above this script's folder at the front of sys.path so
# that the `src.*` modules imported by the checks below can be resolved.
_project_root = Path(__file__).parent.parent
sys.path.insert(0, str(_project_root))

import logging

# INFO-level logging for the whole process, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_csv_document_generation():
    """Check that documents can be generated from the fraud CSV.

    Builds a ``CSVDocumentGenerator`` for ``data/fraudTrain.csv``, asks it for
    all documents, and prints a short preview of the first one.

    Returns:
        bool: True when generation ran to completion; False when any
        exception was raised (its traceback is printed, not propagated).
    """
    banner = "=" * 60
    print(banner)
    print("TEST 1: CSV Document Generation")
    print(banner)

    try:
        # Imported lazily so that an import failure is reported as a failed
        # check instead of aborting the whole script.
        from src.rag.csv_document_generator import CSVDocumentGenerator

        sample_size = 1050000
        csv_path = Path("data/fraudTrain.csv")
        generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)

        print("\nโ Created CSVDocumentGenerator")
        print(f" CSV Path: {csv_path}")
        print(f" Sample Size: {sample_size:,} rows")

        # Generate all documents
        documents = generator.generate_all_documents()
        print(f"\nโ Generated {len(documents)} documents from CSV")

        # Nothing to preview when the generator produced no documents.
        if not documents:
            return True

        # Show sample document
        first = documents[0]
        print("\n--- Sample Document ---")
        print(f"Type: {first.metadata.get('type', 'N/A')}")
        print(f"Source: {first.metadata.get('source', 'N/A')}")
        print("\nContent Preview:")
        print(first.page_content[:400])
        print("...")
        return True

    except Exception as exc:
        print(f"\nโ Error: {exc}")
        import traceback

        traceback.print_exc()
        return False
def test_vector_store_integration():
    """Check that CSV-derived documents can be indexed and searched.

    Loads CSV insight documents through ``DocumentLoader``, adds them to a
    fresh ``VectorStore``, runs one similarity search, and prints the hits.

    Returns:
        bool: True when every step completed; False when any exception was
        raised (its traceback is printed, not propagated).
    """
    banner = "=" * 60
    print("\n" + banner)
    print("TEST 2: Vector Store Integration")
    print(banner)

    try:
        # Imported lazily so that an import failure is reported as a failed
        # check instead of aborting the whole script.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        loader = DocumentLoader()

        # Load CSV insights
        csv_path = settings.data_dir / "fraudTrain.csv"
        print(f"\nโ Loading CSV insights from {csv_path}")
        csv_documents = loader.load_csv_insights(csv_path, sample_size=1050000)
        print(f"โ Loaded {len(csv_documents)} CSV documents")

        # Create vector store and add documents
        print("\nโ Creating vector store...")
        store = VectorStore()
        store.add_documents(csv_documents)
        print(f"โ Added {len(csv_documents)} documents to vector store")

        # Test similarity search
        print("\nโ Testing similarity search...")
        query = "What are fraud patterns in grocery stores?"
        hits = store.similarity_search(query, k=3)
        print(f"\nโ Found {len(hits)} relevant documents for query:")
        print(f" '{query}'")

        for rank, hit in enumerate(hits, start=1):
            meta = hit.metadata
            print(f"\n--- Result {rank} ---")
            print(f"Type: {meta.get('type', 'N/A')}")
            print(f"Category: {meta.get('category', 'N/A')}")
            print(f"Content: {hit.page_content[:200]}...")

        return True

    except Exception as exc:
        print(f"\nโ Error: {exc}")
        import traceback

        traceback.print_exc()
        return False
def test_full_rag_integration():
    """Check the combined PDF + CSV pipeline end to end.

    Loads PDF documents from ``settings.pdf_dir`` and, when
    ``settings.data_dir / "fraudTrain.csv"`` exists, CSV insight documents.
    Adds everything to one ``VectorStore`` and runs a few sample queries,
    printing the source and type of each hit.

    Returns:
        bool: True when every step completed; False when any exception was
        raised (its traceback is printed, not propagated).
    """
    print("\n" + "=" * 60)
    print("TEST 3: Full RAG Integration (PDF + CSV)")
    print("=" * 60)

    try:
        # Imported lazily so that an import failure is reported as a failed
        # check instead of aborting the whole script.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )

        all_documents = []

        # Load PDF documents
        print(f"\nโ Loading PDF documents...")
        # Normalise a falsy loader result to an empty list so the len() call
        # in the summary below cannot fail.
        pdf_documents = (
            document_loader.load_pdfs_from_directory(settings.pdf_dir) or []
        )
        if pdf_documents:
            all_documents.extend(pdf_documents)
            print(f"โ Loaded {len(pdf_documents)} PDF documents")

        # Load CSV insights
        print(f"\nโ Loading CSV insights...")
        csv_path = settings.data_dir / "fraudTrain.csv"
        # Bind up front: the summary below reads csv_documents even when the
        # CSV file is absent, which previously raised UnboundLocalError.
        csv_documents = []
        if csv_path.exists():
            csv_documents = document_loader.load_csv_insights(
                csv_path, sample_size=1050000
            )
            all_documents.extend(csv_documents)
            print(f"โ Loaded {len(csv_documents)} CSV documents")
        else:
            print(f" CSV file not found at {csv_path}; continuing without CSV documents")

        # Create vector store
        print(f"\nโ Creating unified vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(all_documents)
        print(f"โ Total documents in RAG: {len(all_documents)}")
        print(f" - PDF documents: {len(pdf_documents)}")
        print(f" - CSV documents: {len(csv_documents)}")

        # Test queries
        test_queries = [
            "What are common fraud patterns?",
            "Fraud rate in grocery transactions",
            "High risk merchants",
        ]

        print(f"\nโ Testing queries with unified RAG...")
        for query in test_queries:
            results = vector_store.similarity_search(query, k=2)
            print(f"\nQuery: '{query}'")
            print(f" Found {len(results)} results")
            for doc in results:
                # Documents without a 'type' entry are reported as 'pdf'.
                doc_type = doc.metadata.get('type', 'pdf')
                source = doc.metadata.get('source', 'N/A')
                print(f" - Source: {source} (Type: {doc_type})")

        return True

    except Exception as e:
        print(f"\nโ Error: {str(e)}")
        import traceback

        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Script entry point: run the three checks in order, print a summary,
    # and report overall success through the process exit status.
    print("\n" + "=" * 60)
    print("CSV RAG INTEGRATION VERIFICATION")
    print("=" * 60)

    # Run tests. Each check returns True on success and False on failure;
    # the list literal evaluates them in the order written.
    results = [
        ("CSV Document Generation", test_csv_document_generation()),
        ("Vector Store Integration", test_vector_store_integration()),
        ("Full RAG Integration", test_full_rag_integration()),
    ]

    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    for test_name, passed in results:
        status = "โ PASSED" if passed else "โ FAILED"
        print(f"{status} - {test_name}")

    all_passed = all(passed for _, passed in results)

    if all_passed:
        print("\n๐ All tests passed! CSV integration is working correctly.")
    else:
        print("\nโ ๏ธ Some tests failed. Please check the errors above.")
    print("=" * 60)

    # Exit non-zero on failure so shells and CI can detect it; previously the
    # script always exited with status 0.
    if not all_passed:
        sys.exit(1)