File size: 6,639 Bytes
aca8ab4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Test data validation fixes for MCP paper parsing and PDF processing.
These tests verify that malformed data (dicts where lists or strings are expected) is handled correctly.
"""
import sys
from datetime import datetime
from utils.schemas import Paper
from utils.pdf_processor import PDFProcessor


def _check_authors_as_dict():
    """Build a Paper whose ``authors`` is a dict and assert it ends up a list."""
    paper = Paper(
        arxiv_id="test.001",
        title="Test Paper",
        authors={"author1": "John Doe", "author2": "Jane Smith"},  # Dict instead of list!
        abstract="Test abstract",
        pdf_url="https://arxiv.org/pdf/test.001.pdf",
        published=datetime.now(),
        categories=["cs.AI"],
    )
    print("   ✓ Paper created successfully")
    print(f"   Authors type: {type(paper.authors)}")
    print(f"   Authors value: {paper.authors}")
    assert isinstance(paper.authors, list), "Authors should be normalized to list"
    print("   ✓ Authors correctly normalized to list")


def _check_categories_as_dict():
    """Build a Paper whose ``categories`` is a dict and assert it ends up a list."""
    paper = Paper(
        arxiv_id="test.002",
        title="Test Paper 2",
        authors=["John Doe"],
        abstract="Test abstract",
        pdf_url="https://arxiv.org/pdf/test.002.pdf",
        published=datetime.now(),
        categories={"cat1": "cs.AI", "cat2": "cs.LG"},  # Dict instead of list!
    )
    print("   ✓ Paper created successfully")
    print(f"   Categories type: {type(paper.categories)}")
    print(f"   Categories value: {paper.categories}")
    assert isinstance(paper.categories, list), "Categories should be normalized to list"
    print("   ✓ Categories correctly normalized to list")


def _check_multiple_fields_malformed():
    """Build a Paper with dicts in five fields and assert each field's final type."""
    paper = Paper(
        arxiv_id="test.003",
        title={"title": "Test Paper 3"},  # Dict!
        authors={"names": ["John Doe", "Jane Smith"]},  # Dict with nested list!
        abstract={"summary": "Test abstract"},  # Dict!
        pdf_url={"url": "https://arxiv.org/pdf/test.003.pdf"},  # Dict!
        published=datetime.now(),
        categories={"categories": ["cs.AI"]},  # Dict with nested list!
    )
    print("   ✓ Paper created successfully")
    print(f"   Title type: {type(paper.title)}, value: {paper.title}")
    print(f"   Authors type: {type(paper.authors)}, value: {paper.authors}")
    print(f"   Abstract type: {type(paper.abstract)}, value: {paper.abstract[:50]}...")
    print(f"   PDF URL type: {type(paper.pdf_url)}, value: {paper.pdf_url}")
    print(f"   Categories type: {type(paper.categories)}, value: {paper.categories}")

    assert isinstance(paper.title, str), "Title should be normalized to string"
    assert isinstance(paper.authors, list), "Authors should be normalized to list"
    assert isinstance(paper.abstract, str), "Abstract should be normalized to string"
    assert isinstance(paper.pdf_url, str), "PDF URL should be normalized to string"
    assert isinstance(paper.categories, list), "Categories should be normalized to list"
    print("   ✓ All fields correctly normalized")


def test_paper_schema_validators():
    """Test that Paper schema validators correctly normalize malformed data.

    Runs three cases in order: ``authors`` given as a dict, ``categories``
    given as a dict, and several fields given as dicts at once. Progress is
    printed to stdout as each case runs.

    Returns:
        bool: ``True`` when every case passes. ``False`` as soon as one case
        raises (construction error or failed assertion); the error message
        and its traceback are printed and the remaining cases are skipped.
    """
    import traceback

    banner = "=" * 80
    print("\n" + banner)
    print("TEST 1: Paper Schema Validators")
    print(banner)

    cases = (
        ("\n1. Testing authors as dict (malformed data)...", _check_authors_as_dict),
        ("\n2. Testing categories as dict (malformed data)...", _check_categories_as_dict),
        ("\n3. Testing multiple fields malformed...", _check_multiple_fields_malformed),
    )

    for heading, check in cases:
        print(heading)
        try:
            check()
        except Exception as exc:
            # Deliberately broad: this is a reporting boundary, and any error
            # raised while building or checking the Paper counts as a failure.
            print(f"   ✗ Failed: {exc}")
            traceback.print_exc()
            return False

    print("\n" + banner)
    print("✓ ALL PAPER SCHEMA VALIDATION TESTS PASSED")
    print(banner)
    return True


def test_pdf_processor_resilience():
    """Test that PDFProcessor handles malformed Paper objects gracefully.

    Builds a ``PDFProcessor`` and a ``Paper`` whose ``authors`` is passed as
    a dict, chunks a synthetic text with ``chunk_text``, and asserts that the
    first chunk's ``metadata['authors']`` is a list. Progress is printed to
    stdout.

    Returns:
        bool: ``True`` when the check passes. ``False`` when any step raises,
        including processor construction; the error message and traceback
        are printed.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("TEST 2: PDFProcessor Resilience")
    print(banner)

    try:
        # Constructed inside the try so that a setup failure is reported as a
        # failed test instead of escaping and aborting the caller.
        processor = PDFProcessor(chunk_size=100, chunk_overlap=10)

        # Create a paper with properly validated data
        print("\n1. Testing PDF processor with validated Paper object...")
        paper = Paper(
            arxiv_id="test.004",
            title="Test Paper",
            authors={"author1": "John Doe"},  # Will be normalized by validators
            abstract="Test abstract",
            pdf_url="https://arxiv.org/pdf/test.004.pdf",
            published=datetime.now(),
            categories=["cs.AI"],
        )

        # Create a simple test text
        test_text = "This is a test document. " * 100

        chunks = processor.chunk_text(test_text, paper)
        print(f"   ✓ Created {len(chunks)} chunks successfully")

        # Give a clear message instead of an IndexError from chunks[0].
        assert chunks, "chunk_text should return at least one chunk"

        first_chunk_authors = chunks[0].metadata['authors']
        print(f"   First chunk metadata authors type: {type(first_chunk_authors)}")
        print(f"   First chunk metadata authors: {first_chunk_authors}")

        assert isinstance(first_chunk_authors, list), "Chunk metadata authors should be list"
        print("   ✓ Chunk metadata correctly contains list for authors")

    except Exception as exc:
        # Deliberately broad: this is a reporting boundary for the script.
        print(f"   ✗ Failed: {exc}")
        import traceback
        traceback.print_exc()
        return False

    print("\n" + banner)
    print("✓ PDF PROCESSOR RESILIENCE TESTS PASSED")
    print(banner)
    return True


def main():
    """Run both test groups, print a summary, and return a process exit code.

    Both groups always run, even if the first one fails, so the summary
    shows the status of each.

    Returns:
        int: ``0`` when every group passed, ``1`` otherwise.
    """
    banner = "=" * 80

    print("\n" + banner)
    print("DATA VALIDATION FIX VERIFICATION TESTS")
    print(banner)
    print("\nThese tests verify that the fixes for malformed MCP data work correctly:")
    print("- Paper schema validators normalize dict fields to proper types")
    print("- PDF processor handles validated Paper objects without errors")
    print(banner)

    # One entry per group: summary label and the group's boolean result.
    # The list literal evaluates in order, so the groups run in this order.
    results = [
        ("Paper Schema Validators", test_paper_schema_validators()),
        ("PDF Processor Resilience", test_pdf_processor_resilience()),
    ]

    print("\n" + banner)
    print("FINAL RESULTS")
    print(banner)
    for label, passed in results:
        print(f"{label}: {'✓ PASS' if passed else '✗ FAIL'}")
    print(banner)

    if all(passed for _, passed in results):
        print("\n✓ ALL TESTS PASSED - The data validation fixes are working correctly!")
        print("\nThe system should now handle malformed MCP responses gracefully.")
        return 0

    print("\n✗ SOME TESTS FAILED - Please review the errors above")
    return 1


if __name__ == "__main__":
    sys.exit(main())