# GitHub Actions
# Clean sync from GitHub - no large files in history
# aca8ab4
"""
Test script to verify arxiv v2.2.0 PDF URL fix for paper 2102.08370v2.
"""
import os
import sys
import logging
from pathlib import Path
# Add project root to path
sys.path.insert(0, str(Path(__file__).parent))
from utils.arxiv_client import ArxivClient, _extract_pdf_url
from utils.fastmcp_arxiv_server import _extract_pdf_url as fastmcp_extract_pdf_url
import arxiv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def test_extract_pdf_url():
    """Check that both ``_extract_pdf_url`` helpers resolve a PDF URL.

    Fetches arXiv paper ``2102.08370v2`` through the arxiv library (network
    access required), prints the links reported on the result, and asserts
    that the helper from ``utils.arxiv_client`` and the one from
    ``utils.fastmcp_arxiv_server`` both return the same non-``None`` URL
    containing "pdf".

    Returns:
        The PDF URL extracted by the ``utils.arxiv_client`` helper.

    Raises:
        AssertionError: If the paper cannot be fetched or a check fails.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("TEST 1: Direct PDF URL extraction from arxiv.Result")
    print(banner)

    # Test with the problematic paper ID
    paper_id = "2102.08370v2"
    logger.info("Testing PDF URL extraction for paper: %s", paper_id)

    # Fetch the paper using the arxiv library. Search.results() is
    # deprecated in arxiv 2.x, so go through a Client instead. The None
    # default turns "no result" into a readable assertion failure rather
    # than a bare StopIteration with an empty message.
    search = arxiv.Search(id_list=[paper_id])
    result = next(arxiv.Client().results(search), None)
    assert result is not None, f"arXiv returned no result for paper {paper_id}"

    # Show what the arxiv library returns
    print(f"\nPaper ID: {result.entry_id.split('/')[-1]}")
    print(f"Title: {result.title[:80]}...")
    print(f"result.pdf_url (deprecated): {result.pdf_url}")
    print("\nLinks from result.links:")
    for i, link in enumerate(result.links):
        print(f" [{i}] {link.href} ({link.rel})")

    # Test extraction from both clients
    url_arxiv = _extract_pdf_url(result)
    url_fastmcp = fastmcp_extract_pdf_url(result)
    print(f"\nExtracted PDF URL (ArxivClient): {url_arxiv}")
    print(f"Extracted PDF URL (FastMCP): {url_fastmcp}")

    assert url_arxiv is not None, "ArxivClient helper failed to extract PDF URL"
    assert url_fastmcp is not None, "FastMCP helper failed to extract PDF URL"
    assert "pdf" in url_arxiv.lower(), "Extracted URL doesn't contain 'pdf'"
    assert url_arxiv == url_fastmcp, "Both helpers should return same URL"

    print("\n✓ PDF URL extraction test PASSED")
    return url_arxiv
def test_arxiv_client_search():
    """Exercise ``ArxivClient.search_papers()`` and check the PDF URL.

    Runs a title query limited to one result, prints the first paper's
    ID, title and PDF URL, and asserts the URL is set and contains "pdf".

    Returns:
        The first paper returned by the search.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("TEST 2: ArxivClient.search_papers() integration")
    print(rule)

    arxiv_client = ArxivClient(cache_dir="data/test_papers")

    # Search for a specific paper by title
    title_query = "ti:Attention Is All You Need"
    found = arxiv_client.search_papers(query=title_query, max_results=1)
    assert len(found) > 0, "No papers found"

    first = found[0]
    print("\nFound paper:")
    print(f" ID: {first.arxiv_id}")
    print(f" Title: {first.title[:80]}...")
    print(f" PDF URL: {first.pdf_url}")

    assert first.pdf_url is not None, "Paper pdf_url is None"
    assert "pdf" in first.pdf_url.lower(), "PDF URL doesn't contain 'pdf'"

    print("\n✓ ArxivClient search test PASSED")
    return first
def test_fastmcp_download_logic():
    """Replay the URL-resolution step of the FastMCP ``download_paper`` flow.

    Fetches arXiv paper ``2102.08370v2`` (network access required), prints
    the raw ``result.pdf_url`` for comparison, then asserts that the
    FastMCP ``_extract_pdf_url`` helper returns a URL containing the
    paper ID.

    Returns:
        The PDF URL extracted by the FastMCP helper.

    Raises:
        AssertionError: If the paper cannot be fetched or a check fails.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("TEST 3: FastMCP download_paper URL extraction")
    print(banner)

    paper_id = "2102.08370v2"

    # Simulate the download_paper logic. Search.results() is deprecated in
    # arxiv 2.x, so fetch through a Client; the None default turns an empty
    # response into a clear assertion failure instead of a bare
    # StopIteration.
    search = arxiv.Search(id_list=[paper_id])
    result = next(arxiv.Client().results(search), None)
    assert result is not None, f"arXiv returned no result for paper {paper_id}"

    # This is what was failing: result.pdf_url was None
    print("\nOld approach (BROKEN):")
    print(f" result.pdf_url = {result.pdf_url}")

    # New approach with helper
    pdf_url = fastmcp_extract_pdf_url(result)
    print("\nNew approach (FIXED):")
    print(f" _extract_pdf_url(result) = {pdf_url}")

    assert pdf_url is not None, "Failed to extract PDF URL"
    assert paper_id in pdf_url, f"PDF URL doesn't contain paper ID {paper_id}"

    print("\n✓ FastMCP download logic test PASSED")
    return pdf_url
def main():
    """Run the three verification tests in order and print a summary.

    Returns:
        0 when every test passes; 1 when an assertion fails or any other
        exception is raised (the latter also prints a traceback).
    """
    rule = "=" * 80
    print("\n" + rule)
    print("ARXIV v2.2.0 PDF URL FIX - VERIFICATION TESTS")
    print(rule)

    try:
        # Test 1: direct extraction
        test_extract_pdf_url()
        # Test 2: ArxivClient integration
        test_arxiv_client_search()
        # Test 3: FastMCP download logic
        test_fastmcp_download_logic()

        print("\n" + rule)
        print("ALL TESTS PASSED ✓")
        print(rule)

        summary_lines = (
            "\nThe fix successfully resolves the 'unknown url type: None' error",
            "for paper 2102.08370v2 and all other papers.",
            "\nKey changes:",
            " 1. Added _extract_pdf_url() helper to both clients",
            " 2. Extracts PDF URL from result.links (arxiv v2.2.0+)",
            " 3. Falls back to URL construction if needed",
            " 4. Validates URL exists before use",
        )
        for line in summary_lines:
            print(line)
        return 0
    except AssertionError as failure:
        print(f"\n✗ TEST FAILED: {failure}")
        return 1
    except Exception as error:
        # Top-level boundary: report anything unexpected with a traceback.
        print(f"\n✗ ERROR: {error}")
        import traceback

        traceback.print_exc()
        return 1
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())