jofaichow committed on
Commit
d24bd0d
·
1 Parent(s): 3c1e28a

fix: prefer Wikimedia images over Pixabay stock photos on re-check

Browse files

- After Pixabay returns an image, re-check English Wikipedia and
Wikidata tiers in case they were rate-limited on first pass
- Added multi-language Wikipedia tier (fr/de/es/it/ja) as Tier 2
- Fixed cache check to skip empty-string entries (re-try failed items)
- Cache: non-empty URLs only — empty cache entries no longer block retries

.cache/image_cache.json CHANGED
The diff for this file is too large to render. See raw diff
 
src/services/recommender.py CHANGED
@@ -328,6 +328,64 @@ def _fetch_wiki_image(name: str, city: str = "") -> str:
328
  return ""
329
 
330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  # Tourism-related keywords to disambiguate Wikidata results
332
  _TOURISM_KEYWORDS = {
333
  "church", "cathedral", "basilica", "monument", "museum", "palace",
@@ -512,6 +570,42 @@ def _fetch_local_name_image(name: str, city: str = "", country: str = "") -> str
512
  return ""
513
 
514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
  def _fetch_unsplash_api_image(name: str, city: str = "", country: str = "") -> str:
516
  """Tier 6: Search Unsplash for a high-quality landscape photo.
517
  Only called when all Wikimedia sources fail. Uses orientation=landscape
@@ -551,12 +645,14 @@ def _fetch_unsplash_api_image(name: str, city: str = "", country: str = "") -> s
551
 
552
 
553
  def _enrich_one_item(item: dict, city: str = "", country: str = "") -> None:
554
- """Look up image for a single item using 5-tier fallback:
555
- 1. Wikipedia REST/pageimages API
556
- 2. Wikidata P18 image claim (with city/country context)
557
- 3. Wikimedia Commons search (with simplified name variants embedded)
558
- 4. Local name from parentheses (e.g. Koko-shima from Awaji Island)
559
- 5. Unsplash search (landscape orientation, last resort)
 
 
560
 
561
  Results are cached in _IMAGE_CACHE to avoid repeat API calls across searches.
562
  """
@@ -567,42 +663,66 @@ def _enrich_one_item(item: dict, city: str = "", country: str = "") -> None:
567
  item["image_url"] = ""
568
  return
569
 
570
- # Check image cache first
 
571
  cache_key = (name, city or "", country or "")
572
  cached_url = _IMAGE_CACHE.get(cache_key)
573
- if cached_url is not None:
574
  item["image_url"] = cached_url
575
  return
576
 
577
- # Tier 1: Wikipedia
578
  url = _fetch_wiki_image(name, city=city)
579
  if url:
580
  _IMAGE_CACHE[cache_key] = url
581
  item["image_url"] = url
582
  _save_image_cache()
583
  return
584
- # Tier 2: Wikidata (with city/country for disambiguation)
 
 
 
 
 
 
 
585
  url = _fetch_wikidata_image(name, city=city, country=country)
586
  if url:
587
  _IMAGE_CACHE[cache_key] = url
588
  item["image_url"] = url
589
  _save_image_cache()
590
  return
591
- # Tier 3: Wikimedia Commons (includes simplified/variant names)
592
  url = _fetch_commons_image(name, city=city, country=country)
593
  if url:
594
  _IMAGE_CACHE[cache_key] = url
595
  item["image_url"] = url
596
  _save_image_cache()
597
  return
598
- # Tier 4: Local name from parentheses
599
  url = _fetch_local_name_image(name, city=city, country=country)
600
  if url:
601
  _IMAGE_CACHE[cache_key] = url
602
  item["image_url"] = url
603
  _save_image_cache()
604
  return
605
- # Tier 5: Unsplash (landscape only, last resort)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
606
  url = _fetch_unsplash_api_image(name, city=city, country=country)
607
  if url:
608
  _IMAGE_CACHE[cache_key] = url
 
328
  return ""
329
 
330
 
331
+ _MULTILANG_WIKI = ["fr", "de", "es", "it", "ja"]
332
+
333
+
334
+ def _fetch_wiki_image_multilang(name: str, city: str = "") -> str:
335
+ """Tier 2: Search non-English Wikipedias for an image.
336
+ When English Wikipedia has no thumbnail, try French, German, Spanish,
337
+ Italian, and Japanese editions in parallel — the next largest by article
338
+ count and rich in travel-related imagery.
339
+ """
340
+ clean = re.sub(r"\s*\(.*?\)\s*$", "", name).strip()
341
+ search_terms = [clean] if clean and clean != name else [clean, name]
342
+ if city:
343
+ search_terms.append(f"{clean}, {city}" if clean else f"{name}, {city}")
344
+
345
+ import concurrent.futures
346
+
347
+ def _try_lang(lang: str) -> str:
348
+ # Try at most the first two search terms, cleaned name first (most likely to match across languages)
349
+ for term in search_terms[:2]: # try at most 2 terms
350
+ if not term:
351
+ continue
352
+ try:
353
+ url = f"https://{lang}.wikipedia.org/w/api.php?" + urllib.parse.urlencode({
354
+ "action": "query",
355
+ "generator": "search",
356
+ "gsrsearch": term,
357
+ "gsrlimit": 3,
358
+ "prop": "pageimages",
359
+ "pithumbsize": 500,
360
+ "format": "json",
361
+ })
362
+ req = urllib.request.Request(url, headers={"User-Agent": "TravelPlanner/1.0"})
363
+ with urllib.request.urlopen(req, timeout=3) as resp:
364
+ data = json.loads(resp.read().decode())
365
+ pages = data.get("query", {}).get("pages", {})
366
+ for page in pages.values():
367
+ thumb = page.get("thumbnail", {}).get("source", "")
368
+ if thumb:
369
+ return thumb
370
+ except Exception:
371
+ continue
372
+ return ""
373
+
374
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
375
+ futures = {pool.submit(_try_lang, lang): lang for lang in _MULTILANG_WIKI}
376
+ for f in concurrent.futures.as_completed(futures):
377
+ try:
378
+ result = f.result(timeout=5)
379
+ if result:
380
+ # Cancel remaining futures — we found one
381
+ for other in futures:
382
+ other.cancel()
383
+ return result
384
+ except Exception:
385
+ continue
386
+ return ""
387
+
388
+
389
  # Tourism-related keywords to disambiguate Wikidata results
390
  _TOURISM_KEYWORDS = {
391
  "church", "cathedral", "basilica", "monument", "museum", "palace",
 
570
  return ""
571
 
572
 
573
+ def _fetch_pixabay_api_image(name: str, city: str = "", country: str = "") -> str:
574
+ """Tier 6: Search Pixabay for a high-quality landscape photo.
575
+ Free tier: 100 req/min, no monthly cap. Much better limits than Unsplash.
576
+ """
577
+ pixabay_key = os.environ.get("PIXABAY_API_KEY", "")
578
+ if not pixabay_key:
579
+ return ""
580
+
581
+ # Build search query: name + city for better relevance
582
+ clean = re.sub(r"\s*\(.*?\)\s*$", "", name).strip()
583
+ query = clean
584
+ if city:
585
+ query = f"{clean} {city}"
586
+ elif country:
587
+ query = f"{clean} {country}"
588
+
589
+ search_url = "https://pixabay.com/api/?" + urllib.parse.urlencode({
590
+ "key": pixabay_key,
591
+ "q": query,
592
+ "per_page": 3,
593
+ "orientation": "horizontal",
594
+ "safesearch": "true",
595
+ "image_type": "photo",
596
+ })
597
+ try:
598
+ with urllib.request.urlopen(search_url, timeout=8) as resp:
599
+ data = json.loads(resp.read().decode())
600
+ hits = data.get("hits", [])
601
+ if hits:
602
+ # webformatURL is ~640px wide — perfect for cards
603
+ return hits[0]["webformatURL"]
604
+ except Exception:
605
+ pass
606
+ return ""
607
+
608
+
609
  def _fetch_unsplash_api_image(name: str, city: str = "", country: str = "") -> str:
610
  """Tier 6: Search Unsplash for a high-quality landscape photo.
611
  Only called when all Wikimedia sources fail. Uses orientation=landscape
 
645
 
646
 
647
  def _enrich_one_item(item: dict, city: str = "", country: str = "") -> None:
648
+ """Look up image for a single item using 7-tier fallback:
649
+ 1. Wikipedia REST/pageimages API (English)
650
+ 2. Wikipedia REST/pageimages API (French, German, Spanish, Italian, Japanese)
651
+ 3. Wikidata P18 image claim (with city/country context)
652
+ 4. Wikimedia Commons search (with simplified name variants embedded)
653
+ 5. Local name from parentheses (e.g. Koko-shima from Awaji Island)
654
+ 6. Pixabay search (free tier, no monthly cap)
655
+ 7. Unsplash search (landscape orientation, last resort)
656
 
657
  Results are cached in _IMAGE_CACHE to avoid repeat API calls across searches.
658
  """
 
663
  item["image_url"] = ""
664
  return
665
 
666
+ # Check image cache first (only use cached if it's a real URL — empty strings
667
+ # mean the item was never successfully resolved, so re-try)
668
  cache_key = (name, city or "", country or "")
669
  cached_url = _IMAGE_CACHE.get(cache_key)
670
+ if cached_url: # truthy check — non-empty URL only
671
  item["image_url"] = cached_url
672
  return
673
 
674
+ # Tier 1: Wikipedia (English)
675
  url = _fetch_wiki_image(name, city=city)
676
  if url:
677
  _IMAGE_CACHE[cache_key] = url
678
  item["image_url"] = url
679
  _save_image_cache()
680
  return
681
+ # Tier 2: Wikipedia (multi-language fr, de, es, it, ja)
682
+ url = _fetch_wiki_image_multilang(name, city=city)
683
+ if url:
684
+ _IMAGE_CACHE[cache_key] = url
685
+ item["image_url"] = url
686
+ _save_image_cache()
687
+ return
688
+ # Tier 3: Wikidata (with city/country for disambiguation)
689
  url = _fetch_wikidata_image(name, city=city, country=country)
690
  if url:
691
  _IMAGE_CACHE[cache_key] = url
692
  item["image_url"] = url
693
  _save_image_cache()
694
  return
695
+ # Tier 4: Wikimedia Commons (includes simplified/variant names)
696
  url = _fetch_commons_image(name, city=city, country=country)
697
  if url:
698
  _IMAGE_CACHE[cache_key] = url
699
  item["image_url"] = url
700
  _save_image_cache()
701
  return
702
+ # Tier 5: Local name from parentheses
703
  url = _fetch_local_name_image(name, city=city, country=country)
704
  if url:
705
  _IMAGE_CACHE[cache_key] = url
706
  item["image_url"] = url
707
  _save_image_cache()
708
  return
709
+ # Tier 6: Pixabay (free tier, no monthly cap)
710
+ url = _fetch_pixabay_api_image(name, city=city, country=country)
711
+ if url:
712
+ # Second chance: re-check Wikipedia/Wikidata tiers in case they were
713
+ # rate-limited or unavailable on the first pass. Pixabay often returns
714
+ # generic stock photos (same image under different query URLs); prefer
715
+ # a specific Wikimedia image when one exists.
716
+ wiki_url = _fetch_wiki_image(name, city=city)
717
+ if not wiki_url:
718
+ wiki_url = _fetch_wikidata_image(name, city=city, country=country)
719
+ if wiki_url:
720
+ url = wiki_url
721
+ _IMAGE_CACHE[cache_key] = url
722
+ item["image_url"] = url
723
+ _save_image_cache()
724
+ return
725
+ # Tier 7: Unsplash (landscape only, last resort)
726
  url = _fetch_unsplash_api_image(name, city=city, country=country)
727
  if url:
728
  _IMAGE_CACHE[cache_key] = url
tests/test_recommender.py CHANGED
@@ -7,6 +7,7 @@ import pytest
7
 
8
  # Import the units under test from the recommender module
9
  from src.services.recommender import (
 
10
  _fetch_wiki_image,
11
  _haversine_km,
12
  _is_media_entertainment_page,
@@ -273,3 +274,82 @@ class TestFetchWikiImage:
273
 
274
  result = _fetch_wiki_image("XyzzyNonexistentPlace")
275
  assert result == ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # Import the units under test from the recommender module
9
  from src.services.recommender import (
10
+ _enrich_one_item,
11
  _fetch_wiki_image,
12
  _haversine_km,
13
  _is_media_entertainment_page,
 
274
 
275
  result = _fetch_wiki_image("XyzzyNonexistentPlace")
276
  assert result == ""
277
+
278
+
279
+ # ──────────────────────────────────────────────────────────────────────
280
+ # 6. _enrich_one_item — Pixabay fallback re-check
281
+ # ──────────────────────────────────────────────────────────────────────
282
+ class TestEnrichOneItemPixabayRecheck:
283
+ """Tests for Pixabay fallback re-checking Wikipedia/Wikidata for specific images.
284
+
285
+ When Pixabay returns a generic stock photo, the function re-checks
286
+ Wikipedia/Wikidata tiers (which may have been rate-limited on first pass).
287
+ If a specific Wikimedia image is now available, it should be preferred.
288
+ """
289
+
290
+ def test_prefers_wikipedia_over_pixabay_on_recheck(self):
291
+ """After Pixabay returns a URL, re-check Wikipedia finds a specific
292
+ image -> use Wikipedia URL instead of Pixabay generic."""
293
+ item = {"name": "Adashino Nenbutsuji"}
294
+
295
+ with (
296
+ patch("src.services.recommender._IMAGE_CACHE", {}),
297
+ patch("src.services.recommender._fetch_wiki_image") as mock_wiki,
298
+ patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
299
+ patch("src.services.recommender._fetch_wikidata_image", return_value=""),
300
+ patch("src.services.recommender._fetch_commons_image", return_value=""),
301
+ patch("src.services.recommender._fetch_local_name_image", return_value=""),
302
+ patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
303
+ patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
304
+ patch("src.services.recommender._save_image_cache"),
305
+ ):
306
+ # First call (Tier 1): Wikipedia fails (rate-limited)
307
+ # Second call (second chance after Pixabay): Wikipedia succeeds
308
+ mock_wiki.side_effect = ["", "https://upload.wikimedia.org/specific.jpg"]
309
+
310
+ _enrich_one_item(item, city="Kyoto")
311
+
312
+ assert item["image_url"] == "https://upload.wikimedia.org/specific.jpg"
313
+
314
+ def test_prefers_wikidata_over_pixabay_on_recheck(self):
315
+ """After Pixabay returns a URL and Wikipedia still fails, re-check
316
+ Wikidata finds a specific image -> use Wikidata URL instead."""
317
+ item = {"name": "Some Temple"}
318
+
319
+ with (
320
+ patch("src.services.recommender._IMAGE_CACHE", {}),
321
+ patch("src.services.recommender._fetch_wiki_image", return_value=""),
322
+ patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
323
+ patch("src.services.recommender._fetch_wikidata_image") as mock_wikidata,
324
+ patch("src.services.recommender._fetch_commons_image", return_value=""),
325
+ patch("src.services.recommender._fetch_local_name_image", return_value=""),
326
+ patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
327
+ patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
328
+ patch("src.services.recommender._save_image_cache"),
329
+ ):
330
+ # First call (Tier 3): Wikidata fails (rate-limited)
331
+ # Second call (second chance after Pixabay): Wikidata succeeds
332
+ mock_wikidata.side_effect = ["", "https://upload.wikimedia.org/commons/specific.jpg"]
333
+
334
+ _enrich_one_item(item, city="Kyoto")
335
+
336
+ assert item["image_url"] == "https://upload.wikimedia.org/commons/specific.jpg"
337
+
338
+ def test_keeps_pixabay_when_both_wikipedia_and_wikidata_still_fail(self):
339
+ """When both re-checks still fail, Pixabay URL is used as-is."""
340
+ item = {"name": "Obscure Place"}
341
+
342
+ with (
343
+ patch("src.services.recommender._IMAGE_CACHE", {}),
344
+ patch("src.services.recommender._fetch_wiki_image", return_value=""),
345
+ patch("src.services.recommender._fetch_wiki_image_multilang", return_value=""),
346
+ patch("src.services.recommender._fetch_wikidata_image", return_value=""),
347
+ patch("src.services.recommender._fetch_commons_image", return_value=""),
348
+ patch("src.services.recommender._fetch_local_name_image", return_value=""),
349
+ patch("src.services.recommender._fetch_pixabay_api_image", return_value="https://pixabay.com/generic.jpg"),
350
+ patch("src.services.recommender._fetch_unsplash_api_image", return_value=""),
351
+ patch("src.services.recommender._save_image_cache"),
352
+ ):
353
+ _enrich_one_item(item, city="Kyoto")
354
+
355
+ assert item["image_url"] == "https://pixabay.com/generic.jpg"