"""Test script to verify CSV integration into the RAG system."""
import sys
from pathlib import Path
# Add parent directory to path to allow importing src modules
# (the checks below import from the `src` package, so this must run before them).
sys.path.insert(0, str(Path(__file__).parent.parent))
import logging
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Module-level logger; not referenced by the code visible in this file.
logger = logging.getLogger(__name__)
def test_csv_document_generation():
    """Generate documents from the fraud CSV and print a preview of the first one.

    Returns:
        bool: True when ``generate_all_documents()`` completes without raising
        (an empty result still counts as success); False when any exception,
        including a failed import of the generator module, is caught.
    """
    print("=" * 60)
    print("TEST 1: CSV Document Generation")
    print("=" * 60)

    # Single source of truth for the sample size, so the printed figure cannot
    # drift from the value actually handed to the generator.
    sample_size = 1_050_000

    try:
        # Imported inside the try so a missing or broken project module is
        # reported as a failed check instead of aborting the whole script.
        from src.rag.csv_document_generator import CSVDocumentGenerator

        # Relative path: resolved against the current working directory.
        csv_path = Path("data/fraudTrain.csv")
        generator = CSVDocumentGenerator(csv_path, sample_size=sample_size)
        print("\n✓ Created CSVDocumentGenerator")
        print(f" CSV Path: {csv_path}")
        print(f" Sample Size: {sample_size:,} rows")

        # Generate all documents
        documents = generator.generate_all_documents()
        print(f"\n✓ Generated {len(documents)} documents from CSV")

        # Show sample document
        if documents:
            first = documents[0]
            print("\n--- Sample Document ---")
            print(f"Type: {first.metadata.get('type', 'N/A')}")
            print(f"Source: {first.metadata.get('source', 'N/A')}")
            print("\nContent Preview:")
            print(first.page_content[:400])
            print("...")

        return True
    except Exception as e:
        # Top-level boundary for this check: report, dump the traceback, and
        # let the caller carry on with the remaining checks.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_vector_store_integration():
    """Load CSV insight documents, index them, and run one similarity query.

    Returns:
        bool: True when loading, indexing and the query all complete without
        raising; False when any exception (including a failed project import)
        is caught. The hits are printed but not validated.
    """
    print("\n" + "=" * 60)
    print("TEST 2: Vector Store Integration")
    print("=" * 60)

    # sample_size value forwarded to DocumentLoader.load_csv_insights.
    sample_size = 1_050_000

    try:
        # Imported inside the try so an import failure is reported as a failed
        # check rather than crashing the script.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader()

        # Load CSV insights
        csv_path = settings.data_dir / "fraudTrain.csv"
        print(f"\n✓ Loading CSV insights from {csv_path}")
        csv_documents = document_loader.load_csv_insights(csv_path, sample_size=sample_size)
        print(f"✓ Loaded {len(csv_documents)} CSV documents")

        # Create vector store and add documents
        print("\n✓ Creating vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(csv_documents)
        print(f"✓ Added {len(csv_documents)} documents to vector store")

        # Test similarity search
        print("\n✓ Testing similarity search...")
        query = "What are fraud patterns in grocery stores?"
        results = vector_store.similarity_search(query, k=3)
        print(f"\n✓ Found {len(results)} relevant documents for query:")
        print(f" '{query}'")

        for i, doc in enumerate(results, 1):
            print(f"\n--- Result {i} ---")
            print(f"Type: {doc.metadata.get('type', 'N/A')}")
            print(f"Category: {doc.metadata.get('category', 'N/A')}")
            print(f"Content: {doc.page_content[:200]}...")

        return True
    except Exception as e:
        # Top-level boundary for this check: report and keep the script going.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_full_rag_integration():
    """Index PDF and CSV documents together and run a few sample queries.

    A missing CSV file is tolerated: the CSV step is skipped and reported as
    zero CSV documents instead of failing later on an unbound variable.

    Returns:
        bool: True when loading, indexing and every query complete without
        raising; False when any exception (including a failed project import)
        is caught. Query hits are printed but not validated.
    """
    print("\n" + "=" * 60)
    print("TEST 3: Full RAG Integration (PDF + CSV)")
    print("=" * 60)

    # sample_size value forwarded to DocumentLoader.load_csv_insights.
    sample_size = 1_050_000

    try:
        # Imported inside the try so an import failure is reported as a failed
        # check rather than crashing the script.
        from src.rag.document_loader import DocumentLoader
        from src.rag.vector_store import VectorStore
        from src.config.config import settings

        document_loader = DocumentLoader(
            chunk_size=settings.chunk_size,
            chunk_overlap=settings.chunk_overlap,
        )
        all_documents = []

        # Load PDF documents
        print("\n✓ Loading PDF documents...")
        # Normalise a falsy result (None / empty) to a list so the len() calls
        # in the summary below are always valid.
        pdf_documents = document_loader.load_pdfs_from_directory(settings.pdf_dir) or []
        if pdf_documents:
            all_documents.extend(pdf_documents)
            print(f"✓ Loaded {len(pdf_documents)} PDF documents")

        # Load CSV insights
        print("\n✓ Loading CSV insights...")
        # Bound up front: the summary reads this even when the file is absent.
        csv_documents = []
        csv_path = settings.data_dir / "fraudTrain.csv"
        if csv_path.exists():
            csv_documents = document_loader.load_csv_insights(csv_path, sample_size=sample_size)
            all_documents.extend(csv_documents)
            print(f"✓ Loaded {len(csv_documents)} CSV documents")
        else:
            print(f"⚠️ CSV file not found, skipping: {csv_path}")

        # Create vector store
        print("\n✓ Creating unified vector store...")
        vector_store = VectorStore()
        vector_store.add_documents(all_documents)
        print(f"✓ Total documents in RAG: {len(all_documents)}")
        print(f" - PDF documents: {len(pdf_documents)}")
        print(f" - CSV documents: {len(csv_documents)}")

        # Test queries
        test_queries = [
            "What are common fraud patterns?",
            "Fraud rate in grocery transactions",
            "High risk merchants",
        ]
        print("\n✓ Testing queries with unified RAG...")
        for query in test_queries:
            results = vector_store.similarity_search(query, k=2)
            print(f"\nQuery: '{query}'")
            print(f" Found {len(results)} results")
            for doc in results:
                # Documents without a 'type' entry are labelled as PDF.
                doc_type = doc.metadata.get('type', 'pdf')
                source = doc.metadata.get('source', 'N/A')
                print(f" - Source: {source} (Type: {doc_type})")

        return True
    except Exception as e:
        # Top-level boundary for this check: report and keep the script going.
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        return False
if __name__ == "__main__":
    # Script entry point: run the three checks in order, print a summary, and
    # signal overall failure through the process exit status.
    print("\n" + "=" * 60)
    print("CSV RAG INTEGRATION VERIFICATION")
    print("=" * 60)

    # Run tests. Each check traps its own exceptions and returns a bool, so a
    # failure in one does not prevent the others from running.
    results = [
        ("CSV Document Generation", test_csv_document_generation()),
        ("Vector Store Integration", test_vector_store_integration()),
        ("Full RAG Integration", test_full_rag_integration()),
    ]

    # Summary
    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)
    for test_name, passed in results:
        status = "✅ PASSED" if passed else "❌ FAILED"
        print(f"{status} - {test_name}")

    all_passed = all(passed for _, passed in results)
    if all_passed:
        print("\n🎉 All tests passed! CSV integration is working correctly.")
    else:
        print("\n⚠️ Some tests failed. Please check the errors above.")
    print("=" * 60)

    # Non-zero exit status lets shells and CI detect a failed verification.
    if not all_passed:
        sys.exit(1)