roamify / scripts /check_cache.py
jofaichow's picture
Unified warmup script + provider observability + Tokyo caches
8088a69
#!/usr/bin/env python3
"""
Cache health check — scans and repairs all 3 disk caches.
Checks every cached LLM recommendation for:
🖼 Missing image URLs → re-runs enrichment (Tier 1-6, including city fallback)
📍 Missing/bad coordinates → re-runs geocode verification
🗺 Wrong-city attractions → flags them
Usage:
cd roamify && python scripts/check_cache.py [--report-only]
"""
import os
import sys
import time
import json
import argparse
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)
from services.recommender import (
_LLM_CACHE,
_IMAGE_CACHE,
_GEOCODE_CACHE,
_save_llm_cache,
_save_image_cache,
_save_geocode_cache,
_enrich_one_item,
_verify_coordinates,
)
def _coords_are_bad(lat, lon) -> bool:
    """Return True when either coordinate is missing, blank, zero, or non-numeric."""
    for value in (lat, lon):
        if value is None or str(value).strip() == "":
            return True
        try:
            if float(value) == 0:
                return True
        except (TypeError, ValueError):
            # Unparseable junk (e.g. "n/a") is bad data to repair,
            # not a reason to abort the whole scan.
            return True
    return False


def check_cache(report_only: bool = False) -> dict:
    """Scan all LLM cache entries, find and (optionally) repair issues.

    Three passes are made over a snapshot of the ``_LLM_CACHE`` keys
    (``key[0]`` is treated as the city name):

    1. Items with an empty ``image_url`` are re-enriched via
       ``_enrich_one_item`` after dropping their ``(name, city, "")`` entry
       from ``_IMAGE_CACHE``.
    2. Items whose ``latitude``/``longitude`` are missing, blank, zero or
       non-numeric are re-verified via ``_verify_coordinates``.
    3. Image URLs shared by more than one *distinct* attraction name within
       a city are reported and re-enriched.

    Args:
        report_only: When True, issues are printed and counted but no repair
            helper is called and no cache is modified.

    Returns:
        A dict of counters. ``coordinates_out_of_city`` is initialised here
        but never incremented by this function.
    """
    stats = {
        "entries_scanned": 0,
        "items_checked": 0,
        "images_missing": 0,
        "images_fixed": 0,
        "coordinates_missing": 0,
        "coordinates_fixed": 0,
        "coordinates_out_of_city": 0,
    }

    # Snapshot the cache keys to avoid modification during iteration
    keys = list(_LLM_CACHE.keys())

    for key in keys:
        city = key[0]
        items = _LLM_CACHE[key]
        if not items:
            continue
        stats["entries_scanned"] += 1
        fixed_this_entry = False

        for item in items:
            stats["items_checked"] += 1
            name = item.get("name", "???")

            # ── 1. Check image ──
            if not item.get("image_url", ""):
                stats["images_missing"] += 1
                # Printed in both modes so --report-only actually itemises findings.
                print(f" 🖼 Missing image: {city} → {name}")
                if not report_only:
                    # Clear stale image cache entry so _enrich_one_item retries
                    img_cache_key = (name, city, "")
                    if img_cache_key in _IMAGE_CACHE:
                        del _IMAGE_CACHE[img_cache_key]
                    # NOTE(review): relies on _enrich_one_item filling
                    # item["image_url"] in place — confirm against recommender.
                    _enrich_one_item(item, city=city)
                    if item.get("image_url"):
                        stats["images_fixed"] += 1
                        fixed_this_entry = True
                        print(f" ✅ Fixed")
                    else:
                        print(f" ⚠️ Still no image (all tiers exhausted)")
                    time.sleep(0.5)  # Rate-limit friendly pause

            # ── 2. Check coordinates ──
            lat = item.get("latitude")
            lon = item.get("longitude")
            if _coords_are_bad(lat, lon):
                stats["coordinates_missing"] += 1
                print(f" 📍 Bad coords: {city} → {name} (lat={lat}, lon={lon})")
                if not report_only:
                    # Re-verify on a single-item list
                    verified = _verify_coordinates([item], city)
                    if verified and verified[0].get("latitude") and verified[0].get("longitude"):
                        stats["coordinates_fixed"] += 1
                        fixed_this_entry = True
                        # Copy verified item data back
                        item.update(verified[0])
                        print(f" ✅ Fixed → ({item['latitude']}, {item['longitude']})")
                    else:
                        print(f" ⚠️ Still bad — skipping repair")
                    time.sleep(1.0)  # Nominatim rate limit

        if fixed_this_entry:
            print()

    # ── 3. Check for duplicate city fallback images ──
    # Group distinct attraction names per (city, image URL). The same
    # attraction can be cached under several keys for one city; that must
    # not count as a duplicate, so names are de-duplicated (order preserved).
    city_images: dict[str, dict[str, list[str]]] = {}
    for key in keys:
        items = _LLM_CACHE[key]
        if not items:
            continue
        url_map = city_images.setdefault(key[0], {})
        for item in items:
            url = item.get("image_url", "")
            if not url:
                continue
            names = url_map.setdefault(url, [])
            name = item.get("name", "???")
            if name not in names:
                names.append(name)

    for city, url_map in city_images.items():
        for url, names in url_map.items():
            if len(names) < 2:
                continue
            # Same image shared by multiple attractions — likely city fallback
            print(f" 🔁 Duplicate image in {city}: {len(names)} attractions share the same photo")
            for dup_name in names:
                print(f" → {dup_name}")
            if report_only:
                continue

            for dup_name in names:
                img_cache_key = (dup_name, city, "")
                if img_cache_key in _IMAGE_CACHE and _IMAGE_CACHE[img_cache_key] == url:
                    del _IMAGE_CACHE[img_cache_key]

                # Re-enrich to find a specific image (first match per cache entry)
                refetched = False
                for key in keys:
                    if key[0] != city:
                        continue
                    for item in _LLM_CACHE[key] or []:
                        if item.get("name") != dup_name:
                            continue
                        _enrich_one_item(item, city=city)
                        new_url = item.get("image_url", "")
                        if new_url and new_url != url:
                            refetched = True
                            print(f" ✅ Re-fetched unique image for {dup_name}")
                        else:
                            print(f" ⚠️ Still using fallback for {dup_name}")
                        break

                # Count a fix only when a different image was actually obtained.
                if refetched:
                    stats["images_fixed"] += 1
                time.sleep(0.5)

    # ── Summary ──
    return stats
def main():
    """Command-line entry point for the cache health check.

    Parses ``--report-only``, prints the sizes of the three caches, runs
    ``check_cache``, prints a summary with before/after sizes, and — unless
    ``--report-only`` was given — saves all three caches to disk.
    """
    parser = argparse.ArgumentParser(description="Cache health check for Roamify")
    parser.add_argument(
        "--report-only",
        action="store_true",
        help="Only report issues, don't fix them",
    )
    args = parser.parse_args()

    llm_before = len(_LLM_CACHE)
    img_before = len(_IMAGE_CACHE)
    geo_before = len(_GEOCODE_CACHE)

    action = "Reporting" if args.report_only else "Scanning & repairing"
    print(f"{action} caches...")
    print(f" LLM entries: {llm_before}")
    print(f" Image entries: {img_before}")
    print(f" Geocode entries: {geo_before}")
    print()

    stats = check_cache(report_only=args.report_only)

    print()
    print("═" * 55)
    print("Cache health check complete!")
    print(f" Entries scanned: {stats['entries_scanned']}")
    print(f" Items checked: {stats['items_checked']}")
    print(f" Missing images: {stats['images_missing']} → fixed: {stats['images_fixed']}")
    print(f" Bad coords: {stats['coordinates_missing']} → fixed: {stats['coordinates_fixed']}")

    llm_after = len(_LLM_CACHE)
    img_after = len(_IMAGE_CACHE)
    geo_after = len(_GEOCODE_CACHE)
    print()
    # Separate the before/after counts; printed back-to-back they are
    # unreadable (e.g. 12 and 13 would render as "1213").
    print(f" LLM entries: {llm_before} → {llm_after}")
    print(f" Image entries: {img_before} → {img_after}")
    print(f" Geocode entries: {geo_before} → {geo_after}")

    if not args.report_only:
        # Persist repairs only when repairs were allowed to happen.
        _save_llm_cache()
        _save_image_cache()
        _save_geocode_cache()
        print()
        print("All caches saved to disk ✅")
# Run the health check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()