# Fraud-Chatbot / test / test_vector_store.py
# Uploaded by ahmzakif
# Commit fd99b61 (verified): "feat: add new project"
"""Test script to verify CSV integration into RAG system.

Defines three checks -- CSV document generation, vector store integration
and full RAG integration (PDF + CSV) -- each of which prints its progress
and returns ``True`` or ``False``.  When the file is executed as a script
the checks are run in that order and a summary is printed.
"""
import sys
from pathlib import Path

# Add parent directory to path to allow importing src modules
# (the directory two levels above this file is put first on sys.path, so
# this must run before any ``from src...`` import is attempted).
sys.path.insert(0, str(Path(__file__).parent.parent))

import logging

# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_csv_document_generation():
    """Check that documents can be generated from the fraud CSV.

    Builds a ``CSVDocumentGenerator`` for ``data/fraudTrain.csv`` (a path
    relative to the current working directory), asks it for all documents
    and prints the metadata plus the first 400 characters of the first one.

    Returns:
        bool: ``True`` when every step ran without raising, ``False`` when
        any exception (including a failed project import) was caught.  The
        error and its traceback are printed in the failure case.
    """
    # Single source of truth for the row cap: it is both passed to the
    # generator and shown in the progress message.
    sample_size = 1050000

    print("=" * 60)
    print("TEST 1: CSV Document Generation")
    print("=" * 60)

    try:
        # Imported inside the try so that an import failure is reported as a
        # failed check rather than aborting the caller.
        from src.rag.csv_document_generator import CSVDocumentGenerator

        csv_path = Path("data/fraudTrain.csv")
        generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
        print("\n✓ Created CSVDocumentGenerator")
        print(f" CSV Path: {csv_path}")
        print(f" Sample Size: {sample_size:,} rows")

        # Generate all documents
        documents = generator.generate_all_documents()
        print(f"\n✓ Generated {len(documents)} documents from CSV")

        # Show sample document
        if documents:
            first = documents[0]
            print("\n--- Sample Document ---")
            print(f"Type: {first.metadata.get('type', 'N/A')}")
            print(f"Source: {first.metadata.get('source', 'N/A')}")
            print("\nContent Preview:")
            print(first.page_content[:400])
            print("...")

        return True
    except Exception as e:
        # Deliberately broad: this is a reporting boundary, and the full
        # traceback is still printed for diagnosis.
        import traceback

        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return False
def test_vector_store_integration():
    """Check that CSV-derived documents can be indexed and searched.

    Loads CSV insight documents through ``DocumentLoader.load_csv_insights``
    from ``settings.data_dir / "fraudTrain.csv"``, adds them to a fresh
    ``VectorStore`` and runs one similarity search (``k=3``), printing the
    type, category and first 200 characters of each hit.

    Returns:
        bool: ``True`` when every step ran without raising, ``False`` when
        any exception (including a failed project import) was caught.  The
        error and its traceback are printed in the failure case.
    """
    sample_size = 1050000

    print("\n" + "=" * 60)
    print("TEST 2: Vector Store Integration")
    print("=" * 60)

    try:
        # Imported inside the try so that an import failure is reported as a
        # failed check rather than aborting the caller.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader()

        # Load CSV insights
        csv_path = settings.data_dir / "fraudTrain.csv"
        print(f"\n✓ Loading CSV insights from {csv_path}")
        csv_documents = document_loader.load_csv_insights(csv_path, sample_size=sample_size)
        print(f"✓ Loaded {len(csv_documents)} CSV documents")

        # Create vector store and add documents
        print("\n✓ Creating vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(csv_documents)
        print(f"✓ Added {len(csv_documents)} documents to vector store")

        # Test similarity search
        print("\n✓ Testing similarity search...")
        query = "What are fraud patterns in grocery stores?"
        results = vector_store.similarity_search(query, k=3)
        print(f"\n✓ Found {len(results)} relevant documents for query:")
        print(f" '{query}'")

        for i, doc in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Type: {doc.metadata.get('type', 'N/A')}")
            print(f"Category: {doc.metadata.get('category', 'N/A')}")
            print(f"Content: {doc.page_content[:200]}...")

        return True
    except Exception as e:
        # Deliberately broad: this is a reporting boundary, and the full
        # traceback is still printed for diagnosis.
        import traceback

        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return False
def test_full_rag_integration():
    """Check a single vector store built from both PDF and CSV documents.

    Loads PDF documents from ``settings.pdf_dir`` and, when
    ``settings.data_dir / "fraudTrain.csv"`` exists, CSV insight documents;
    adds everything to one ``VectorStore`` and runs three sample queries
    (``k=2``), printing the source and type of each hit.

    A missing CSV file or a PDF loader that returns nothing is treated as
    "zero documents of that kind" instead of crashing the summary lines.

    Returns:
        bool: ``True`` when every step ran without raising, ``False`` when
        any exception (including a failed project import) was caught.  The
        error and its traceback are printed in the failure case.
    """
    sample_size = 1050000

    print("\n" + "=" * 60)
    print("TEST 3: Full RAG Integration (PDF + CSV)")
    print("=" * 60)

    try:
        # Imported inside the try so that an import failure is reported as a
        # failed check rather than aborting the caller.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )

        # Load PDF documents
        print("\n✓ Loading PDF documents...")
        # Normalise a falsy result (None / empty) to a list so the len()
        # call in the summary below cannot fail.
        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir) or []
        if pdf_documents:
            print(f"✓ Loaded {len(pdf_documents)} PDF documents")

        # Load CSV insights
        print("\n✓ Loading CSV insights...")
        # Bound up front: the summary reads this even when the file is absent.
        csv_documents = []
        csv_path = settings.data_dir / "fraudTrain.csv"
        if csv_path.exists():
            csv_documents = document_loader.load_csv_insights(csv_path, sample_size=sample_size)
            print(f"✓ Loaded {len(csv_documents)} CSV documents")

        all_documents = [*pdf_documents, *csv_documents]

        # Create vector store
        print("\n✓ Creating unified vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(all_documents)
        print(f"✓ Total documents in RAG: {len(all_documents)}")
        print(f" - PDF documents: {len(pdf_documents)}")
        print(f" - CSV documents: {len(csv_documents)}")

        # Test queries
        test_queries = [
            "What are common fraud patterns?",
            "Fraud rate in grocery transactions",
            "High risk merchants",
        ]
        print("\n✓ Testing queries with unified RAG...")
        for query in test_queries:
            results = vector_store.similarity_search(query, k=2)
            print(f"\nQuery: '{query}'")
            print(f" Found {len(results)} results")
            for doc in results:
                # Documents without a 'type' entry are reported as 'pdf'.
                doc_type = doc.metadata.get('type', 'pdf')
                source = doc.metadata.get('source', 'N/A')
                print(f" - Source: {source} (Type: {doc_type})")

        return True
    except Exception as e:
        # Deliberately broad: this is a reporting boundary, and the full
        # traceback is still printed for diagnosis.
        import traceback

        print(f"\n❌ Error: {e}")
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Script entry point: run the three checks in order, print a summary and
    # exit with a non-zero status when any check reported failure so that
    # shells and CI jobs can detect it.
    print("\n" + "=" * 60)
    print("CSV RAG INTEGRATION VERIFICATION")
    print("=" * 60)

    # Run tests
    checks = (
        ("CSV Document Generation", test_csv_document_generation),
        ("Vector Store Integration", test_vector_store_integration),
        ("Full RAG Integration", test_full_rag_integration),
    )
    results = [(test_name, check()) for test_name, check in checks]

    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for test_name, passed in results:
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"{status} - {test_name}")

    all_passed = all(passed for _, passed in results)
    if all_passed:
        print("\n🎉 All tests passed! CSV integration is working correctly.")
    else:
        print("\n⚠️ Some tests failed. Please check the errors above.")
    print("=" * 60)

    if not all_passed:
        raise SystemExit(1)