Spaces:

samir72
/

Multi-Agent-Research-Paper-Analysis-System

Running

Multi-Agent-Research-Paper-Analysis-System / tests /test_arxiv_v2_fix.py

GitHub Actions

Clean sync from GitHub - no large files in history

aca8ab4 2 months ago

4.84 kB

	"""
	Test script to verify arxiv v2.2.0 PDF URL fix for paper 2102.08370v2.
	"""
	import os
	import sys
	import logging
	from pathlib import Path

	# Add project root to path. This file lives in tests/, so the root (where
	# the ``utils`` package is imported from) is one directory above this one.
	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from utils.arxiv_client import ArxivClient, _extract_pdf_url
	from utils.fastmcp_arxiv_server import _extract_pdf_url as fastmcp_extract_pdf_url
	import arxiv

	# Configure root logging at INFO level so logger.info() calls below are shown.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)


	def test_extract_pdf_url():
	    """Check both ``_extract_pdf_url`` helpers against a fetched arxiv.Result.

	    Fetches paper 2102.08370v2 through the ``arxiv`` library, prints the
	    links reported on the result, then runs the ArxivClient helper and the
	    FastMCP helper on the same result.

	    Returns:
	        The PDF URL extracted by the ArxivClient helper.

	    Raises:
	        AssertionError: If no result is returned for the paper, either
	            helper returns None, the URL does not contain "pdf", or the
	            two helpers disagree.
	    """
	    print("\n" + "="*80)
	    print("TEST 1: Direct PDF URL extraction from arxiv.Result")
	    print("="*80)

	    # Test with the problematic paper ID
	    paper_id = "2102.08370v2"
	    logger.info("Testing PDF URL extraction for paper: %s", paper_id)

	    # Fetch the paper using arxiv library. Search.results() is deprecated in
	    # arxiv 2.x, so the query goes through a Client instead. The None default
	    # turns "no such paper" into a readable assertion failure rather than a
	    # bare StopIteration with an empty message.
	    search = arxiv.Search(id_list=[paper_id])
	    result = next(arxiv.Client().results(search), None)
	    assert result is not None, f"arXiv returned no result for paper {paper_id}"

	    # Show what arxiv library returns
	    print(f"\nPaper ID: {result.entry_id.split('/')[-1]}")
	    print(f"Title: {result.title[:80]}...")
	    print(f"result.pdf_url (deprecated): {result.pdf_url}")
	    print("\nLinks from result.links:")
	    for i, link in enumerate(result.links):
	        print(f" [{i}] {link.href} ({link.rel})")

	    # Test extraction from both clients
	    url_arxiv = _extract_pdf_url(result)
	    url_fastmcp = fastmcp_extract_pdf_url(result)

	    print(f"\nExtracted PDF URL (ArxivClient): {url_arxiv}")
	    print(f"Extracted PDF URL (FastMCP): {url_fastmcp}")

	    assert url_arxiv is not None, "ArxivClient helper failed to extract PDF URL"
	    assert url_fastmcp is not None, "FastMCP helper failed to extract PDF URL"
	    assert "pdf" in url_arxiv.lower(), "Extracted URL doesn't contain 'pdf'"
	    assert url_arxiv == url_fastmcp, "Both helpers should return same URL"

	    print("\n✓ PDF URL extraction test PASSED")
	    return url_arxiv


	def test_arxiv_client_search():
	    """Run ArxivClient.search_papers() for one title query and check its PDF URL.

	    Returns:
	        The first paper returned by the search.

	    Raises:
	        AssertionError: If the search returns nothing, or the first paper's
	            ``pdf_url`` is None or does not contain "pdf".
	    """
	    rule = "=" * 80
	    print("\n" + rule)
	    print("TEST 2: ArxivClient.search_papers() integration")
	    print(rule)

	    arxiv_client = ArxivClient(cache_dir="data/test_papers")

	    # Title-restricted query, capped at a single hit
	    hits = arxiv_client.search_papers(query="ti:Attention Is All You Need", max_results=1)

	    assert len(hits) >= 1, "No papers found"
	    first_hit = hits[0]

	    print("\nFound paper:")
	    print(f" ID: {first_hit.arxiv_id}")
	    print(f" Title: {first_hit.title[:80]}...")
	    print(f" PDF URL: {first_hit.pdf_url}")

	    assert first_hit.pdf_url is not None, "Paper pdf_url is None"
	    assert "pdf" in first_hit.pdf_url.lower(), "PDF URL doesn't contain 'pdf'"

	    print("\n✓ ArxivClient search test PASSED")
	    return first_hit


	def test_fastmcp_download_logic():
	    """Reproduce the URL lookup used by download_paper for paper 2102.08370v2.

	    Fetches the paper through the ``arxiv`` library, prints the value of
	    ``result.pdf_url`` for comparison, and then extracts the URL with the
	    FastMCP ``_extract_pdf_url`` helper.

	    Returns:
	        The PDF URL extracted by the FastMCP helper.

	    Raises:
	        AssertionError: If no result is returned for the paper, the helper
	            returns None, or the URL does not contain the paper ID.
	    """
	    print("\n" + "="*80)
	    print("TEST 3: FastMCP download_paper URL extraction")
	    print("="*80)

	    paper_id = "2102.08370v2"

	    # Simulate the download_paper logic. Search.results() is deprecated in
	    # arxiv 2.x, so the query goes through a Client instead. The None default
	    # turns "no such paper" into a readable assertion failure rather than a
	    # bare StopIteration with an empty message.
	    search = arxiv.Search(id_list=[paper_id])
	    result = next(arxiv.Client().results(search), None)
	    assert result is not None, f"arXiv returned no result for paper {paper_id}"

	    # This is what was failing: result.pdf_url was None
	    print("\nOld approach (BROKEN):")
	    print(f" result.pdf_url = {result.pdf_url}")

	    # New approach with helper
	    pdf_url = fastmcp_extract_pdf_url(result)
	    print("\nNew approach (FIXED):")
	    print(f" _extract_pdf_url(result) = {pdf_url}")

	    assert pdf_url is not None, "Failed to extract PDF URL"
	    assert paper_id in pdf_url, f"PDF URL doesn't contain paper ID {paper_id}"

	    print("\n✓ FastMCP download logic test PASSED")
	    return pdf_url


	def main():
	    """Run the three verification tests in order and print a summary.

	    Returns:
	        0 when every test completes; 1 when a test raises AssertionError
	        or any other Exception (the latter also prints a traceback).
	    """
	    divider = "=" * 80
	    print("\n" + divider)
	    print("ARXIV v2.2.0 PDF URL FIX - VERIFICATION TESTS")
	    print(divider)

	    try:
	        # Order: direct extraction, ArxivClient integration, FastMCP download logic
	        for check in (
	            test_extract_pdf_url,
	            test_arxiv_client_search,
	            test_fastmcp_download_logic,
	        ):
	            check()

	        print("\n" + divider)
	        print("ALL TESTS PASSED ✓")
	        print(divider)

	        summary_lines = (
	            "\nThe fix successfully resolves the 'unknown url type: None' error",
	            "for paper 2102.08370v2 and all other papers.",
	            "\nKey changes:",
	            " 1. Added _extract_pdf_url() helper to both clients",
	            " 2. Extracts PDF URL from result.links (arxiv v2.2.0+)",
	            " 3. Falls back to URL construction if needed",
	            " 4. Validates URL exists before use",
	        )
	        for line in summary_lines:
	            print(line)

	        return 0

	    except AssertionError as failure:
	        # A failed check inside one of the tests
	        print(f"\n✗ TEST FAILED: {failure}")
	        return 1
	    except Exception as error:
	        # Anything else: report it and show where it came from
	        print(f"\n✗ ERROR: {error}")
	        import traceback
	        traceback.print_exc()
	        return 1


	if __name__ == "__main__":
	    # Use main()'s return value (0 or 1) as the process exit status.
	    raise SystemExit(main())