# File size: 6,639 Bytes
# aca8ab4
"""
Test data validation fixes for MCP paper parsing and PDF processing.
This test verifies that malformed data (dicts instead of lists) is handled correctly.
"""
import sys
from datetime import datetime
from utils.schemas import Paper
from utils.pdf_processor import PDFProcessor
def test_paper_schema_validators():
    """Check that ``Paper`` normalizes malformed (dict-valued) fields.

    Three cases are exercised in order, each constructing a ``Paper`` with
    dicts passed where the assertions expect lists or strings:

    1. ``authors`` given as a dict.
    2. ``categories`` given as a dict.
    3. ``title``, ``authors``, ``abstract``, ``pdf_url`` and ``categories``
       all given as dicts.

    Progress and the observed types/values are printed to stdout.

    Returns:
        bool: True when every case passes; False as soon as one case raises
        (a construction error or a failed assertion).
    """
    # NOTE(review): the "β" status marker in the messages below looks like a
    # mis-encoded check/cross glyph -- confirm against the file's encoding.
    # NOTE(review): the bool return value is consumed by the script's runner;
    # pytest expects test functions to return None -- confirm how this file
    # is meant to be executed.
    print("\n" + "="*80)
    print("TEST 1: Paper Schema Validators")
    print("="*80)

    # Test 1: Authors as dict (malformed)
    print("\n1. Testing authors as dict (malformed data)...")
    try:
        paper = Paper(
            arxiv_id="test.001",
            title="Test Paper",
            authors={"author1": "John Doe", "author2": "Jane Smith"},  # Dict instead of list!
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.001.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )
        print(" β Paper created successfully")
        print(f" Authors type: {type(paper.authors)}")
        print(f" Authors value: {paper.authors}")
        assert isinstance(paper.authors, list), "Authors should be normalized to list"
        print(" β Authors correctly normalized to list")
    except Exception as e:
        # Broad catch is deliberate: any failure, including a failed
        # assertion, is reported as a failed case rather than propagated.
        print(f" β Failed: {e}")
        return False

    # Test 2: Categories as dict (malformed)
    print("\n2. Testing categories as dict (malformed data)...")
    try:
        paper = Paper(
            arxiv_id="test.002",
            title="Test Paper 2",
            authors=["John Doe"],
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.002.pdf",
            published=datetime.now(),
            categories={"cat1": "cs.AI", "cat2": "cs.LG"},  # Dict instead of list!
        )
        print(" β Paper created successfully")
        print(f" Categories type: {type(paper.categories)}")
        print(f" Categories value: {paper.categories}")
        assert isinstance(paper.categories, list), "Categories should be normalized to list"
        print(" β Categories correctly normalized to list")
    except Exception as e:
        print(f" β Failed: {e}")
        return False

    # Test 3: Multiple fields malformed
    print("\n3. Testing multiple fields malformed...")
    try:
        paper = Paper(
            arxiv_id="test.003",
            title={"title": "Test Paper 3"},  # Dict!
            authors={"names": ["John Doe", "Jane Smith"]},  # Dict with nested list!
            abstract={"summary": "Test abstract"},  # Dict!
            pdf_url={"url": "https://arxiv.org/pdf/test.003.pdf"},  # Dict!
            published=datetime.now(),
            categories={"categories": ["cs.AI"]},  # Dict with nested list!
        )
        print(" β Paper created successfully")
        # Diagnostics are printed before the assertions so the observed
        # types are visible even when a check below fails.
        print(f" Title type: {type(paper.title)}, value: {paper.title}")
        print(f" Authors type: {type(paper.authors)}, value: {paper.authors}")
        print(f" Abstract type: {type(paper.abstract)}, value: {paper.abstract[:50]}...")
        print(f" PDF URL type: {type(paper.pdf_url)}, value: {paper.pdf_url}")
        print(f" Categories type: {type(paper.categories)}, value: {paper.categories}")
        assert isinstance(paper.title, str), "Title should be normalized to string"
        assert isinstance(paper.authors, list), "Authors should be normalized to list"
        assert isinstance(paper.abstract, str), "Abstract should be normalized to string"
        assert isinstance(paper.pdf_url, str), "PDF URL should be normalized to string"
        assert isinstance(paper.categories, list), "Categories should be normalized to list"
        print(" β All fields correctly normalized")
    except Exception as e:
        print(f" β Failed: {e}")
        return False

    print("\n" + "="*80)
    print("β ALL PAPER SCHEMA VALIDATION TESTS PASSED")
    print("="*80)
    return True
def test_pdf_processor_resilience():
    """Check that ``PDFProcessor.chunk_text`` works with a normalized ``Paper``.

    A ``Paper`` is built with ``authors`` given as a dict, a repeated sample
    sentence is chunked with ``chunk_size=100`` / ``chunk_overlap=10``, and
    the first chunk's ``metadata['authors']`` is asserted to be a list.
    Progress is printed to stdout; on failure the traceback is printed too.

    Returns:
        bool: True when the check passes; False when anything in the
        scenario raises, including constructing the processor itself.
    """
    # NOTE(review): the "β" status marker in the messages below looks like a
    # mis-encoded check/cross glyph -- confirm against the file's encoding.
    print("\n" + "="*80)
    print("TEST 2: PDFProcessor Resilience")
    print("="*80)

    # Create a paper with properly validated data
    print("\n1. Testing PDF processor with validated Paper object...")
    try:
        # Built inside the try block so that a constructor failure is
        # reported as a failed test instead of aborting the whole run.
        processor = PDFProcessor(chunk_size=100, chunk_overlap=10)

        paper = Paper(
            arxiv_id="test.004",
            title="Test Paper",
            authors={"author1": "John Doe"},  # Will be normalized by validators
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.004.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )

        # Create a simple test text
        test_text = "This is a test document. " * 100
        chunks = processor.chunk_text(test_text, paper)
        print(f" β Created {len(chunks)} chunks successfully")

        # Fail with a clear message rather than an IndexError on chunks[0].
        assert len(chunks) > 0, "chunk_text should return at least one chunk"

        first_authors = chunks[0].metadata['authors']
        print(f" First chunk metadata authors type: {type(first_authors)}")
        print(f" First chunk metadata authors: {first_authors}")
        assert isinstance(first_authors, list), "Chunk metadata authors should be list"
        print(" β Chunk metadata correctly contains list for authors")
    except Exception as e:
        # Broad catch is deliberate: any failure is reported, with its
        # traceback, as a failed test rather than propagated.
        print(f" β Failed: {e}")
        import traceback
        traceback.print_exc()
        return False

    print("\n" + "="*80)
    print("β PDF PROCESSOR RESILIENCE TESTS PASSED")
    print("="*80)
    return True
def main():
    """Run both verification tests, print a summary, and return an exit code.

    Both tests are always executed, even when the first one fails, so the
    summary reports the status of each.

    Returns:
        int: 0 when both tests pass, 1 otherwise.
    """
    # NOTE(review): the "β" status marker in the messages below looks like a
    # mis-encoded check/cross glyph -- confirm against the file's encoding.
    print("\n" + "="*80)
    print("DATA VALIDATION FIX VERIFICATION TESTS")
    print("="*80)
    print("\nThese tests verify that the fixes for malformed MCP data work correctly:")
    print("- Paper schema validators normalize dict fields to proper types")
    print("- PDF processor handles validated Paper objects without errors")
    print("="*80)

    test1_pass = test_paper_schema_validators()
    test2_pass = test_pdf_processor_resilience()

    print("\n" + "="*80)
    print("FINAL RESULTS")
    print("="*80)
    print(f"Paper Schema Validators: {'β PASS' if test1_pass else 'β FAIL'}")
    print(f"PDF Processor Resilience: {'β PASS' if test2_pass else 'β FAIL'}")
    print("="*80)

    if test1_pass and test2_pass:
        print("\nβ ALL TESTS PASSED - The data validation fixes are working correctly!")
        print("\nThe system should now handle malformed MCP responses gracefully.")
        return 0

    print("\nβ SOME TESTS FAILED - Please review the errors above")
    return 1


if __name__ == "__main__":
    sys.exit(main())
|