| | """ |
| | Test script to verify arxiv v2.2.0 PDF URL fix for paper 2102.08370v2. |
| | """ |
| | import os |
| | import sys |
| | import logging |
| | from pathlib import Path |
| |
|
| | |
# Put the directory containing this script at the front of the import search
# path. NOTE(review): presumably this is so the `utils.*` imports that follow
# resolve when the script is launched from another directory — confirm.
sys.path.insert(0, str(Path(__file__).parent))
| |
|
| | from utils.arxiv_client import ArxivClient, _extract_pdf_url |
| | from utils.fastmcp_arxiv_server import _extract_pdf_url as fastmcp_extract_pdf_url |
| | import arxiv |
| |
|
# Configure the root logger at INFO level for this script run.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by the tests below.
logger = logging.getLogger(__name__)
| |
|
| |
|
def test_extract_pdf_url():
    """Check both ``_extract_pdf_url`` helpers against a live arXiv result.

    Fetches paper ``2102.08370v2`` through the arXiv API (network access is
    required), prints the result's links for inspection, and verifies that the
    ArxivClient helper and the FastMCP helper each return a PDF URL and agree
    with each other.

    Returns:
        The PDF URL extracted by the ArxivClient helper.

    Raises:
        AssertionError: If no result comes back for the paper, either helper
            returns ``None``, the ArxivClient URL does not contain ``"pdf"``,
            or the two helpers disagree.
    """
    print("\n" + "="*80)
    print("TEST 1: Direct PDF URL extraction from arxiv.Result")
    print("="*80)

    paper_id = "2102.08370v2"
    logger.info("Testing PDF URL extraction for paper: %s", paper_id)

    search = arxiv.Search(id_list=[paper_id])
    # Search.results() is deprecated since arxiv 2.0; go through a Client.
    # The None default turns an empty response into a readable assertion
    # failure instead of a StopIteration that carries no message.
    result = next(arxiv.Client().results(search), None)
    assert result is not None, f"arXiv returned no result for paper {paper_id}"

    # Show what the API returned so a failure below can be diagnosed by eye.
    print(f"\nPaper ID: {result.entry_id.split('/')[-1]}")
    print(f"Title: {result.title[:80]}...")
    print(f"result.pdf_url (deprecated): {result.pdf_url}")
    print("\nLinks from result.links:")
    for i, link in enumerate(result.links):
        print(f" [{i}] {link.href} ({link.rel})")

    # Run both copies of the helper on the same result object.
    url_arxiv = _extract_pdf_url(result)
    url_fastmcp = fastmcp_extract_pdf_url(result)

    print(f"\nExtracted PDF URL (ArxivClient): {url_arxiv}")
    print(f"Extracted PDF URL (FastMCP): {url_fastmcp}")

    assert url_arxiv is not None, "ArxivClient helper failed to extract PDF URL"
    assert url_fastmcp is not None, "FastMCP helper failed to extract PDF URL"
    assert "pdf" in url_arxiv.lower(), "Extracted URL doesn't contain 'pdf'"
    assert url_arxiv == url_fastmcp, "Both helpers should return same URL"

    print("\n✓ PDF URL extraction test PASSED")
    return url_arxiv
| |
|
| |
|
def test_arxiv_client_search():
    """Exercise ``ArxivClient.search_papers`` and check the hit's PDF URL.

    Creates a client with cache directory ``data/test_papers``, runs a
    single-result title query, prints the first hit, and asserts that its
    ``pdf_url`` is set and contains ``"pdf"`` (case-insensitive).

    Returns:
        The first paper object returned by the search.

    Raises:
        AssertionError: If the search returns nothing, or the first hit's
            ``pdf_url`` is ``None`` or lacks ``"pdf"``.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("TEST 2: ArxivClient.search_papers() integration")
    print(banner)

    arxiv_client = ArxivClient(cache_dir="data/test_papers")
    title_query = "ti:Attention Is All You Need"
    hits = arxiv_client.search_papers(query=title_query, max_results=1)

    assert len(hits) > 0, "No papers found"
    top_hit = hits[0]

    # Echo the hit so the URL can be inspected in the console output.
    print("\nFound paper:")
    print(f" ID: {top_hit.arxiv_id}")
    print(f" Title: {top_hit.title[:80]}...")
    print(f" PDF URL: {top_hit.pdf_url}")

    assert top_hit.pdf_url is not None, "Paper pdf_url is None"
    assert "pdf" in top_hit.pdf_url.lower(), "PDF URL doesn't contain 'pdf'"

    print("\n✓ ArxivClient search test PASSED")
    return top_hit
| |
|
| |
|
def test_fastmcp_download_logic():
    """Check the URL-extraction step used by the FastMCP download path.

    Fetches paper ``2102.08370v2`` through the arXiv API (network access is
    required), prints the value of ``result.pdf_url`` for comparison, then
    extracts the URL with the FastMCP ``_extract_pdf_url`` helper and checks
    it.

    Returns:
        The PDF URL extracted by the FastMCP helper.

    Raises:
        AssertionError: If no result comes back for the paper, the helper
            returns ``None``, or the URL does not contain the paper ID.
    """
    print("\n" + "="*80)
    print("TEST 3: FastMCP download_paper URL extraction")
    print("="*80)

    paper_id = "2102.08370v2"

    search = arxiv.Search(id_list=[paper_id])
    # Search.results() is deprecated since arxiv 2.0; go through a Client.
    # The None default turns an empty response into a readable assertion
    # failure instead of a StopIteration that carries no message.
    result = next(arxiv.Client().results(search), None)
    assert result is not None, f"arXiv returned no result for paper {paper_id}"

    # Value the previous code path relied on, printed for comparison only.
    print("\nOld approach (BROKEN):")
    print(f" result.pdf_url = {result.pdf_url}")

    # Value produced by the helper under test.
    pdf_url = fastmcp_extract_pdf_url(result)
    print("\nNew approach (FIXED):")
    print(f" _extract_pdf_url(result) = {pdf_url}")

    assert pdf_url is not None, "Failed to extract PDF URL"
    assert paper_id in pdf_url, f"PDF URL doesn't contain paper ID {paper_id}"

    print("\n✓ FastMCP download logic test PASSED")
    return pdf_url
| |
|
| |
|
def main():
    """Run the three verification tests in order and report the outcome.

    Returns:
        int: ``0`` when every test passes; ``1`` when a test raises
        ``AssertionError`` or any other ``Exception`` (for the latter a
        traceback is printed as well).
    """
    rule = "=" * 80
    print("\n" + rule)
    print("ARXIV v2.2.0 PDF URL FIX - VERIFICATION TESTS")
    print(rule)

    try:
        # Same order as the TEST 1/2/3 banners; return values are not needed.
        for run_test in (
            test_extract_pdf_url,
            test_arxiv_client_search,
            test_fastmcp_download_logic,
        ):
            run_test()

        print("\n" + rule)
        print("ALL TESTS PASSED ✓")
        print(rule)
        for summary_line in (
            "\nThe fix successfully resolves the 'unknown url type: None' error",
            "for paper 2102.08370v2 and all other papers.",
            "\nKey changes:",
            " 1. Added _extract_pdf_url() helper to both clients",
            " 2. Extracts PDF URL from result.links (arxiv v2.2.0+)",
            " 3. Falls back to URL construction if needed",
            " 4. Validates URL exists before use",
        ):
            print(summary_line)

        return 0

    except AssertionError as failure:
        # A test's own check failed: report its message, no traceback.
        print(f"\n✗ TEST FAILED: {failure}")
        return 1
    except Exception as error:
        # Anything else (network, import, API errors): include the traceback.
        print(f"\n✗ ERROR: {error}")
        import traceback
        traceback.print_exc()
        return 1
| |
|
| |
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
| |
|