File size: 1,749 Bytes
3d070f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pytest

from src.tools.europepmc import EuropePMCTool
from src.tools.openalex import OpenAlexTool
from src.tools.pubmed import PubMedTool
from src.tools.search_handler import SearchHandler, extract_paper_id


@pytest.mark.integration
@pytest.mark.slow
async def test_real_search_deduplicates() -> None:
    """Integration test: a real multi-source search yields no duplicate paper IDs.

    Runs a single query through a ``SearchHandler`` configured with the PubMed,
    Europe PMC and OpenAlex tools, then checks that:

    1. the search reports at least one result, and
    2. the paper IDs extracted from the returned evidence are all distinct.

    Evidence items for which ``extract_paper_id`` returns a falsy value are
    left out of the uniqueness check.
    """
    # Note: PubMedTool handles missing API key gracefully (lower rate limit)
    handler = SearchHandler(
        tools=[PubMedTool(), EuropePMCTool(), OpenAlexTool()],
        timeout=30.0,
    )

    # "sildenafil erectile dysfunction" is a well-indexed topic likely to appear in all sources
    result = await handler.execute("sildenafil erectile dysfunction", max_results_per_tool=5)

    # 1. Sanity check: the live search must return something at all.
    assert result.total_found > 0, "Search should return some results"

    # We deliberately do NOT assert result.total_found < 15 (5 results * 3 tools): overlap
    # between the sources is expected for this query, but 15 unique papers is theoretically
    # possible. Verifying ID uniqueness explicitly is the more reliable check.

    # 2. Verify no duplicate IDs in the returned evidence.
    # Extract each ID exactly once, then drop falsy values (e.g. None) so that evidence
    # without an identifiable paper ID is not counted as a duplicate of other such items.
    # NOTE(review): if no evidence item yields an ID, the check below passes vacuously.
    extracted_ids = (extract_paper_id(e) for e in result.evidence)
    paper_ids = [pid for pid in extracted_ids if pid]

    unique_ids = set(paper_ids)
    # The message expression is only evaluated when the assertion fails.
    assert len(paper_ids) == len(unique_ids), (
        f"Duplicate IDs found: {[x for x in paper_ids if paper_ids.count(x) > 1]}"
    )