roamify / tests /test_recommender.py
jofaichow's picture
fix: prefer Wikimedia images over Pixabay stock photos on re-check
d24bd0d
"""Tests for the recommender module."""
import json
from unittest.mock import MagicMock, patch
import pytest
# Import the units under test from the recommender module
from src.services.recommender import (
_enrich_one_item,
_fetch_wiki_image,
_haversine_km,
_is_media_entertainment_page,
_parse_json_response,
name_key,
)
# ──────────────────────────────────────────────────────────────────────
# 1. _parse_json_response
# ──────────────────────────────────────────────────────────────────────
class TestParseJsonResponse:
    """Exercise _parse_json_response on well-formed, fenced and broken input."""

    def test_valid_json_array(self):
        """A plain JSON array comes back as the equivalent list of dicts."""
        raw = '[{"name": "Eiffel Tower"}, {"name": "Louvre"}]'
        expected = [{"name": "Eiffel Tower"}, {"name": "Louvre"}]
        parsed = _parse_json_response(raw)
        assert parsed == expected

    def test_valid_json_object_wraps_to_list(self):
        """A lone JSON object is returned inside a one-element list."""
        raw = '{"name": "Sagrada Familia", "city": "Barcelona"}'
        expected = [{"name": "Sagrada Familia", "city": "Barcelona"}]
        parsed = _parse_json_response(raw)
        assert parsed == expected

    def test_json_in_markdown_fences(self):
        """A payload wrapped in ```json ... ``` fences is still parsed."""
        fenced = '```json\n[{"name": "Colosseum"}]\n```'
        parsed = _parse_json_response(fenced)
        assert parsed == [{"name": "Colosseum"}]

    def test_truncated_json_last_object_missing_brace(self):
        """A stream cut off before the closing ']' is expected to yield None.

        The input below opens an array but never closes it (nor the last
        object), and the test pins that such input is reported as None.
        """
        cut_off = '[{"name": "Eiffel"}, {"name": "Louvre"'
        parsed = _parse_json_response(cut_off)
        assert parsed is None

    def test_empty_string_returns_none(self):
        """The empty string is reported as None."""
        parsed = _parse_json_response("")
        assert parsed is None

    def test_whitespace_only_returns_none(self):
        """A string made only of whitespace is reported as None."""
        parsed = _parse_json_response(" \n\t ")
        assert parsed is None

    def test_garbage_string_returns_none(self):
        """Text with no JSON in it at all is reported as None."""
        parsed = _parse_json_response("not json at all")
        assert parsed is None

    def test_none_like_text_returns_none(self):
        """Prose containing stray braces/brackets is not mistaken for JSON."""
        noisy = "just some random { text } with [ brackets"
        parsed = _parse_json_response(noisy)
        assert parsed is None
# ──────────────────────────────────────────────────────────────────────
# 2. _haversine_km
# ──────────────────────────────────────────────────────────────────────
class TestHaversineKm:
    """Sanity checks for the _haversine_km great-circle distance helper."""

    def test_london_to_paris(self):
        """London to Paris should land in the 330-360 km window (~344 km)."""
        london = (51.5074, -0.1278)  # 51.5074 N, 0.1278 W
        paris = (48.8566, 2.3522)  # 48.8566 N, 2.3522 E
        dist = _haversine_km(*london, *paris)
        assert 330 <= dist <= 360, f"Expected ~344 km, got {dist:.1f}"

    def test_tokyo_to_osaka(self):
        """Tokyo to Osaka should land in the 390-420 km window (~403 km)."""
        tokyo = (35.6762, 139.6503)  # 35.6762 N, 139.6503 E
        osaka = (34.6937, 135.5023)  # 34.6937 N, 135.5023 E
        dist = _haversine_km(*tokyo, *osaka)
        assert 390 <= dist <= 420, f"Expected ~403 km, got {dist:.1f}"

    def test_same_point_zero_distance(self):
        """A point measured against itself is (within 1 m) zero km away."""
        spot = (48.8566, 2.3522)
        dist = _haversine_km(*spot, *spot)
        assert dist == pytest.approx(0.0, abs=0.001)

    def test_symmetric(self):
        """Swapping origin and destination must not change the distance."""
        nyc = (40.7128, -74.0060)
        la = (34.0522, -118.2437)
        outbound = _haversine_km(*nyc, *la)
        inbound = _haversine_km(*la, *nyc)
        assert outbound == pytest.approx(inbound)
# ──────────────────────────────────────────────────────────────────────
# 3. _is_media_entertainment_page
# ──────────────────────────────────────────────────────────────────────
class TestIsMediaEntertainmentPage:
    """Checks that film/TV Wikipedia pages are flagged and real places are not."""

    # Every call passes (title, extract). The first six tests use an empty
    # extract, so only the title can trigger detection; the last three supply
    # a non-empty extract to cover extract-driven detection as well.

    def test_star_wars_film(self):
        """A '(film)' qualifier in the title is flagged."""
        flagged = _is_media_entertainment_page("Star Wars (film)", "")
        assert flagged is True

    def test_dark_knight_movie(self):
        """A '(movie)' qualifier in the title is flagged."""
        flagged = _is_media_entertainment_page("The Dark Knight (movie)", "")
        assert flagged is True

    def test_friends_tv_series(self):
        """A '(TV series)' qualifier in the title is flagged."""
        flagged = _is_media_entertainment_page("Friends (TV series)", "")
        assert flagged is True

    def test_short_film_media(self):
        """A '(short film)' qualifier is flagged even on a landmark-like name."""
        flagged = _is_media_entertainment_page("Colosseum (short film)", "")
        assert flagged is True

    def test_eiffel_tower_not_media(self):
        """A landmark title without any media qualifier is not flagged."""
        flagged = _is_media_entertainment_page("Eiffel Tower", "")
        assert flagged is False

    def test_central_park_not_media(self):
        """A park title without any media qualifier is not flagged."""
        flagged = _is_media_entertainment_page("Central Park", "")
        assert flagged is False

    def test_extract_based_detection(self):
        """An unqualified title is flagged when the extract describes a film."""
        title = "Inception"
        extract = "Inception is a 2010 science fiction film directed by Christopher Nolan..."
        flagged = _is_media_entertainment_page(title, extract)
        assert flagged is True

    def test_extract_tv_series_detection(self):
        """An unqualified title is flagged when the extract describes a TV series."""
        title = "Breaking Bad"
        extract = "Breaking Bad is an American television series created by Vince Gilligan..."
        flagged = _is_media_entertainment_page(title, extract)
        assert flagged is True

    def test_clean_title_and_extract(self):
        """A place title with a place-describing extract is not flagged."""
        title = "Machu Picchu"
        extract = "Machu Picchu is a 15th-century Inca citadel situated on a mountain ridge..."
        flagged = _is_media_entertainment_page(title, extract)
        assert flagged is False
# ──────────────────────────────────────────────────────────────────────
# 4. name_key
# ──────────────────────────────────────────────────────────────────────
class TestNameKey:
    """Checks on name_key, the normaliser used to deduplicate attractions."""

    def test_eiffel_tower(self):
        """'Eiffel Tower' normalises to 'eiffel' — the trailing 'Tower' is dropped."""
        attraction = {"name": "Eiffel Tower"}
        key = name_key(attraction)
        assert key == "eiffel"

    def test_louvre_museum(self):
        """'Louvre Museum' normalises to 'louvre' — the trailing 'Museum' is dropped."""
        attraction = {"name": "Louvre Museum"}
        key = name_key(attraction)
        assert key == "louvre"

    def test_notre_dame_with_parenthetical_and_church_suffix(self):
        """Parenthetical, ' Church' suffix and the hyphen all vanish from the key."""
        # Expected key keeps only the lower-cased letters of "Notre-Dame".
        attraction = {"name": "Notre-Dame Church (Paris)"}
        key = name_key(attraction)
        assert key == "notredame"

    def test_empty_name(self):
        """An empty 'name' value produces an empty key."""
        key = name_key({"name": ""})
        assert key == ""

    def test_missing_name_key(self):
        """A dict lacking 'name' altogether also produces an empty key."""
        key = name_key({})
        assert key == ""

    def test_same_name_with_different_punctuation(self):
        """Dots and apostrophes do not affect the resulting key."""
        punctuated = name_key({"name": "St. Peter's Basilica"})
        bare = name_key({"name": "St Peters Basilica"})
        assert punctuated == bare

    def test_same_name_different_casing(self):
        """Upper- and lower-case spellings collapse to the same key."""
        shouted = name_key({"name": "COLOSSEUM"})
        quiet = name_key({"name": "colosseum"})
        assert shouted == quiet
# ──────────────────────────────────────────────────────────────────────
# 5. _fetch_wiki_image
# ──────────────────────────────────────────────────────────────────────
class TestFetchWikiImage:
    """Check _fetch_wiki_image against canned _http_get_json responses.

    Each test replaces ``src.services.recommender._http_get_json`` with a fake
    that picks its reply from substrings of the requested URL, so no real
    network traffic is involved.
    """

    def test_returns_thumbnail_url_when_present(self):
        """A summary payload carrying a thumbnail yields that thumbnail's URL."""

        def fake_http_get(url, **kwargs):
            # Search-API style request: report one matching title.
            if all(marker in url for marker in ("action=query", "list=search")):
                return {"query": {"search": [{"title": "Eiffel Tower"}]}}
            # REST summary request: reply includes a thumbnail source.
            if "rest_v1/page/summary" in url:
                return {
                    "title": "Eiffel Tower",
                    "extract": "The Eiffel Tower is a wrought-iron lattice tower...",
                    "thumbnail": {"source": "https://upload.wikimedia.org/thumb_eiffel.jpg"},
                }
            # Anything else is treated as a failed request.
            return None

        with patch("src.services.recommender._http_get_json", side_effect=fake_http_get):
            image_url = _fetch_wiki_image("Eiffel Tower", city="Paris")

        assert image_url == "https://upload.wikimedia.org/thumb_eiffel.jpg"

    def test_returns_empty_string_when_no_thumbnail(self):
        """If neither summary nor pageimages replies carry an image, '' is returned."""

        def fake_http_get(url, **kwargs):
            # Search-API style request: report one matching title.
            if all(marker in url for marker in ("action=query", "list=search")):
                return {"query": {"search": [{"title": "Some Obscure Place"}]}}
            # REST summary request: page exists, but there is no "thumbnail" key.
            if "rest_v1/page/summary" in url:
                return {
                    "title": "Some Obscure Place",
                    "extract": "Some Obscure Place is a location...",
                }
            # pageimages request: the page entry likewise has no image data.
            if "prop=pageimages" in url:
                page_entry = {"pageid": 12345, "title": "Some Obscure Place"}
                return {"query": {"pages": {"12345": page_entry}}}
            return None

        with patch("src.services.recommender._http_get_json", side_effect=fake_http_get):
            image_url = _fetch_wiki_image("Some Obscure Place")

        assert image_url == ""

    def test_returns_empty_string_when_search_finds_nothing(self):
        """An empty search result list leads to '' being returned."""

        def fake_http_get(url, **kwargs):
            # Search-API style request: no hits at all.
            if all(marker in url for marker in ("action=query", "list=search")):
                return {"query": {"search": []}}
            return None

        with patch("src.services.recommender._http_get_json", side_effect=fake_http_get):
            image_url = _fetch_wiki_image("XyzzyNonexistentPlace")

        assert image_url == ""
# ──────────────────────────────────────────────────────────────────────
# 6. _enrich_one_item — Pixabay fallback re-check
# ──────────────────────────────────────────────────────────────────────
class TestEnrichOneItemPixabayRecheck:
    """Pin how _enrich_one_item picks an image when Pixabay is the fallback.

    In every test all image fetchers of ``src.services.recommender`` are
    patched: Pixabay always offers a generic URL, the image cache starts
    empty, and cache saving is stubbed out. The tests differ only in what the
    Wikipedia and Wikidata fetchers return on their first and second calls
    (an empty first answer stands in for e.g. a rate-limited request).
    A Wikimedia URL obtained on the second call must win over Pixabay.
    """

    def test_prefers_wikipedia_over_pixabay_on_recheck(self):
        """Wikipedia: '' on the first call, a URL on the second -> that URL is used."""
        attraction = {"name": "Adashino Nenbutsuji"}
        # Consumed in order: first lookup comes back empty, the retry succeeds.
        wiki_answers = ["", "https://upload.wikimedia.org/specific.jpg"]
        with (
            patch("src.services.recommender._IMAGE_CACHE", {}),
            patch("src.services.recommender._fetch_wiki_image", side_effect=wiki_answers),
            patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
            patch("src.services.recommender._fetch_wikidata_image", return_value=""),
            patch("src.services.recommender._fetch_commons_image", return_value=""),
            patch("src.services.recommender._fetch_local_name_image", return_value=""),
            patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
            patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
            patch("src.services.recommender._save_image_cache"),
        ):
            _enrich_one_item(attraction, city="Kyoto")

        assert attraction["image_url"] == "https://upload.wikimedia.org/specific.jpg"

    def test_prefers_wikidata_over_pixabay_on_recheck(self):
        """Wikipedia always empty; Wikidata succeeds on its second call -> its URL is used."""
        attraction = {"name": "Some Temple"}
        # Consumed in order: first lookup comes back empty, the retry succeeds.
        wikidata_answers = ["", "https://upload.wikimedia.org/commons/specific.jpg"]
        with (
            patch("src.services.recommender._IMAGE_CACHE", {}),
            patch("src.services.recommender._fetch_wiki_image", return_value=""),
            patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
            patch("src.services.recommender._fetch_wikidata_image", side_effect=wikidata_answers),
            patch("src.services.recommender._fetch_commons_image", return_value=""),
            patch("src.services.recommender._fetch_local_name_image", return_value=""),
            patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
            patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
            patch("src.services.recommender._save_image_cache"),
        ):
            _enrich_one_item(attraction, city="Kyoto")

        assert attraction["image_url"] == "https://upload.wikimedia.org/commons/specific.jpg"

    def test_keeps_pixabay_when_both_wikipedia_and_wikidata_still_fail(self):
        """With Wikipedia and Wikidata always empty, the Pixabay URL is kept."""
        attraction = {"name": "Obscure Place"}
        with (
            patch("src.services.recommender._IMAGE_CACHE", {}),
            patch("src.services.recommender._fetch_wiki_image", return_value=""),
            patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
            patch("src.services.recommender._fetch_wikidata_image", return_value=""),
            patch("src.services.recommender._fetch_commons_image", return_value=""),
            patch("src.services.recommender._fetch_local_name_image", return_value=""),
            patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
            patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
            patch("src.services.recommender._save_image_cache"),
        ):
            _enrich_one_item(attraction, city="Kyoto")

        assert attraction["image_url"] == "https://pixabay.com/generic.jpg"