#!/usr/bin/env python3
"""
Cache health check — scans and repairs all 3 disk caches.
Checks every cached LLM recommendation for:
🖼 Missing image URLs → re-runs enrichment (Tier 1-6, including city fallback)
📍 Missing/bad coordinates → re-runs geocode verification
🗺 Wrong-city attractions → flags them
Usage:
cd roamify && python scripts/check_cache.py [--report-only]
"""
import os
import sys
import time
import json
import argparse
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)
from services.recommender import (
_LLM_CACHE,
_IMAGE_CACHE,
_GEOCODE_CACHE,
_save_llm_cache,
_save_image_cache,
_save_geocode_cache,
_enrich_one_item,
_verify_coordinates,
)
def _coordinate_is_bad(value) -> bool:
    """Return True when *value* cannot serve as a latitude/longitude.

    A coordinate counts as bad when it is ``None``, blank, exactly zero, or
    not convertible to ``float`` at all.
    """
    if value is None:
        return True
    try:
        return float(value) == 0
    except (TypeError, ValueError):
        # Blank strings and non-numeric junk (e.g. "N/A") land here; they are
        # unusable coordinates, not a reason to abort the whole scan.
        return True


def check_cache(report_only: bool = False) -> dict:
    """Scan all LLM cache entries, find and repair issues.

    Three passes are made over the items stored in ``_LLM_CACHE``:

    1. Items without an ``image_url`` are re-enriched via ``_enrich_one_item``.
    2. Items whose ``latitude``/``longitude`` are missing, blank, zero or
       non-numeric are re-checked via ``_verify_coordinates``.
    3. Within each city, an image URL shared by more than one distinct
       attraction name is reported and the affected items are re-enriched.

    Args:
        report_only: When True, nothing is modified. Missing images and bad
            coordinates are only counted; shared images are still printed.

    Returns:
        A dict of counters: ``entries_scanned``, ``items_checked``,
        ``images_missing``, ``images_fixed``, ``coordinates_missing``,
        ``coordinates_fixed`` and ``coordinates_out_of_city`` (the last one
        is initialised to 0 and never updated by this function).
    """
    stats = {
        "entries_scanned": 0,
        "items_checked": 0,
        "images_missing": 0,
        "images_fixed": 0,
        "coordinates_missing": 0,
        "coordinates_fixed": 0,
        "coordinates_out_of_city": 0,
    }

    # Snapshot the cache keys to avoid modification during iteration
    keys = list(_LLM_CACHE.keys())

    for key in keys:
        city = key[0]  # the first element of the cache key is used as the city
        items = _LLM_CACHE[key]
        if not items:
            continue

        stats["entries_scanned"] += 1
        fixed_this_entry = False

        for item in items:
            stats["items_checked"] += 1
            name = item.get("name", "???")

            # ── 1. Check image ──
            if not item.get("image_url", ""):
                stats["images_missing"] += 1
                if not report_only:
                    print(f" 🖼 Missing image: {city} → {name}")
                    # Clear stale image cache entry so _enrich_one_item retries
                    img_cache_key = (name, city, "")
                    if img_cache_key in _IMAGE_CACHE:
                        del _IMAGE_CACHE[img_cache_key]
                    _enrich_one_item(item, city=city)
                    if item.get("image_url"):
                        stats["images_fixed"] += 1
                        fixed_this_entry = True
                        print(f" ✅ Fixed")
                    else:
                        print(f" ⚠️ Still no image (all tiers exhausted)")
                    time.sleep(0.5)  # Rate-limit friendly pause

            # ── 2. Check coordinates ──
            lat = item.get("latitude")
            lon = item.get("longitude")
            if _coordinate_is_bad(lat) or _coordinate_is_bad(lon):
                stats["coordinates_missing"] += 1
                if not report_only:
                    print(f" 📍 Bad coords: {city} → {name} (lat={lat}, lon={lon})")
                    # Re-verify on a single-item list
                    verified = _verify_coordinates([item], city)
                    if verified and verified[0].get("latitude") and verified[0].get("longitude"):
                        stats["coordinates_fixed"] += 1
                        fixed_this_entry = True
                        # Copy verified item data back
                        item.update(verified[0])
                        print(f" ✅ Fixed → ({item['latitude']}, {item['longitude']})")
                    else:
                        print(f" ⚠️ Still bad — skipping repair")
                    time.sleep(1.0)  # Nominatim rate limit

        if fixed_this_entry:
            print()

    # ── 3. Check for duplicate city fallback images ──
    # Group images per city: city -> image URL -> distinct attraction names.
    # Names are de-duplicated because one attraction may be cached under
    # several keys for the same city; that is not a shared image.
    city_images: dict[str, dict[str, list[str]]] = {}
    for key in keys:
        items = _LLM_CACHE[key]
        if not items:
            continue
        url_map = city_images.setdefault(key[0], {})
        for item in items:
            url = item.get("image_url", "")
            if not url:
                continue
            names = url_map.setdefault(url, [])
            name = item.get("name", "???")
            if name not in names:
                names.append(name)

    for city, url_map in city_images.items():
        for url, names in url_map.items():
            if len(names) <= 1:
                continue

            # Same image shared by multiple attractions — likely city fallback
            print(f" 🔁 Duplicate image in {city}: {len(names)} attractions share the same photo")
            for dup_name in names:
                print(f" → {dup_name}")

            if report_only:
                continue

            for dup_name in names:
                # Drop the cached image only if it is the shared URL.
                img_cache_key = (dup_name, city, "")
                if img_cache_key in _IMAGE_CACHE and _IMAGE_CACHE[img_cache_key] == url:
                    del _IMAGE_CACHE[img_cache_key]

                # Re-enrich to find a specific image
                refetched = False
                for key in keys:
                    if key[0] != city:
                        continue
                    for item in _LLM_CACHE[key] or ():
                        if item.get("name") != dup_name:
                            continue
                        _enrich_one_item(item, city=city)
                        new_url = item.get("image_url", "")
                        if new_url and new_url != url:
                            refetched = True
                            print(f" ✅ Re-fetched unique image for {dup_name}")
                        else:
                            print(f" ⚠️ Still using fallback for {dup_name}")
                        break  # only the first match in this cache entry

                # Count a fix only when a different image was actually found.
                if refetched:
                    stats["images_fixed"] += 1
                time.sleep(0.5)

    # ── Summary ──
    return stats
def main():
    """Parse CLI flags, run the cache health check and print a summary.

    The sizes of the three caches are captured before and after the run so
    the summary can show how each one changed. The caches are written back
    to disk unless ``--report-only`` was given.
    """
    cli = argparse.ArgumentParser(description="Cache health check for Roamify")
    cli.add_argument(
        "--report-only",
        action="store_true",
        help="Only report issues, don't fix them",
    )
    args = cli.parse_args()

    # Sizes before the scan, for the before/after comparison below.
    llm_before, img_before, geo_before = (
        len(_LLM_CACHE),
        len(_IMAGE_CACHE),
        len(_GEOCODE_CACHE),
    )

    if args.report_only:
        action = "Reporting"
    else:
        action = "Scanning & repairing"
    print(f"{action} caches...")
    print(f" LLM entries: {llm_before}")
    print(f" Image entries: {img_before}")
    print(f" Geocode entries: {geo_before}")
    print()

    stats = check_cache(report_only=args.report_only)

    print()
    print("═" * 55)
    print("Cache health check complete!")
    print(f" Entries scanned: {stats['entries_scanned']}")
    print(f" Items checked: {stats['items_checked']}")
    print(f" Missing images: {stats['images_missing']} → fixed: {stats['images_fixed']}")
    print(f" Bad coords: {stats['coordinates_missing']} → fixed: {stats['coordinates_fixed']}")

    # Sizes after the scan.
    llm_after, img_after, geo_after = (
        len(_LLM_CACHE),
        len(_IMAGE_CACHE),
        len(_GEOCODE_CACHE),
    )
    print()
    print(f" LLM entries: {llm_before} → {llm_after}")
    print(f" Image entries: {img_before} → {img_after}")
    print(f" Geocode entries: {geo_before} → {geo_after}")

    if args.report_only:
        return

    # Persist every cache, in the same order as they are reported.
    for persist in (_save_llm_cache, _save_image_cache, _save_geocode_cache):
        persist()
    print()
    print("All caches saved to disk ✅")
# Run the health check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|