| |
| """ |
| Unified warmup — generates + checks + saves, one (city × category) at a time. |
| |
| Replaces prewarm_cache.py and check_cache.py with a single iterative pipeline: |
| For each combo: |
| 1. Generate LLM recommendations (with per-provider logging) |
| 2. Save all 3 caches ← checkpoint |
| 3. Check items for missing images, fix what we can |
| 4. Save all 3 caches again ← fixes persisted |
| 5. Update progress file (resumable after crash) |
| |
| Usage: |
| cd roamify && python scripts/warmup.py |
| |
| Resume after Ctrl+C: |
| python scripts/warmup.py # skips completed, re-tries failed |
| """ |
|
|
| import os |
| import sys |
| import time |
| import json |
| from datetime import datetime |
|
|
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src")) |
|
|
| from dotenv import load_dotenv |
| load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True) |
|
|
| from services.recommender import ( |
| get_recommendations_cached, |
| _LLM_CACHE, |
| _IMAGE_CACHE, |
| _GEOCODE_CACHE, |
| _save_llm_cache, |
| _save_image_cache, |
| _save_geocode_cache, |
| _enrich_one_item, |
| ) |
|
|
| import argparse |
|
|
| |
|
|
# Destinations to pre-warm; warmup() walks them in this order.
CITIES = [
    "Paris", "London", "Rome", "Barcelona",
    "New York", "Tokyo", "Bangkok", "Sydney",
    "Cape Town", "Rio de Janeiro", "Istanbul", "Dubai",
    "Seoul", "Bali", "Prague", "San Francisco",
    "Marrakech", "Kyoto", "Hong Kong", "Singapore",
    "Amsterdam", "Berlin", "Vienna", "Lisbon",
    "Budapest", "Athens", "Mumbai", "Mexico City",
]

# Category names; each (city × category) combo switches exactly one of them on.
CATEGORIES = [
    "Landmark",
    "Culture",
    "Nature",
    "Gems",
    "Photo",
    "Food",
    "Shopping",
]

# Resume-state file, located one directory above this script.
PROGRESS_FILE = os.path.join(
    os.path.dirname(__file__),
    "..",
    ".warmup_progress.json",
)
|
|
|
|
def _cat_dict(cat_name: str) -> dict:
    """Return the UI-style category toggle map with only ``cat_name`` enabled.

    Every name in ``CATEGORIES`` becomes a key; its value is ``True`` only
    when it equals ``cat_name`` (an unknown name therefore yields all
    ``False``).
    """
    toggles = {}
    for candidate in CATEGORIES:
        toggles[candidate] = candidate == cat_name
    return toggles
|
|
|
|
def _cat_hash(cat_name: str) -> str:
    """Serialize the single-category toggle map as key-sorted JSON.

    warmup() pairs this string with the city name to look entries up in
    ``_LLM_CACHE``.
    """
    toggles = _cat_dict(cat_name)
    return json.dumps(toggles, sort_keys=True)
|
|
|
|
| |
|
|
| def load_progress() -> dict: |
| if not os.path.exists(PROGRESS_FILE): |
| return {"version": 1, "combos": {}} |
| try: |
| with open(PROGRESS_FILE) as f: |
| return json.load(f) |
| except (json.JSONDecodeError, OSError): |
| return {"version": 1, "combos": {}} |
|
|
|
|
| def save_progress(progress: dict) -> None: |
| with open(PROGRESS_FILE, "w") as f: |
| json.dump(progress, f, indent=2) |
|
|
|
|
def combo_id(city: str, cat: str) -> str:
    """Build the progress-file key for one (city, category) pair: ``<city>::<cat>``."""
    return "{}::{}".format(city, cat)
|
|
|
|
def is_combo_done(progress: dict, cid: str) -> bool:
    """Tell whether combo ``cid`` is recorded as successfully completed.

    A combo with no entry, or whose recorded status is anything other than
    ``"success"``, is reported as not done, so warmup() does not skip it.
    """
    record = progress["combos"].get(cid)
    return record is not None and record.get("status") == "success"
|
|
|
|
| |
|
|
| def _is_city_fallback_url(url: str, items_in_city: list[dict]) -> bool: |
| """Check if a URL appears to be a city-level fallback (shared by ≥3 attractions).""" |
| if not url: |
| return False |
| count = sum(1 for it in items_in_city if it.get("image_url") == url) |
| return count >= 3 |
|
|
|
|
def _evict_and_enrich(item: dict, city: str) -> None:
    """Drop the item's image-cache entry (if any), then re-run enrichment on it."""
    img_cache_key = (item.get("name", "???"), city, "")
    if img_cache_key in _IMAGE_CACHE:
        del _IMAGE_CACHE[img_cache_key]
    _enrich_one_item(item, city=city)


def check_items(items: list[dict], city: str) -> dict:
    """Repair missing or generic images on ``items`` in place and report the outcome.

    Three passes:
      1. Items with no ``image_url`` → clear the cache entry and retry enrichment.
      2. Items whose URL looks like a city fallback (see
         ``_is_city_fallback_url``) → try to replace it.
      3. Items still without an image → one more retry after a longer pause
         (to avoid rate limits).

    Args:
        items: Recommendation dicts; they are mutated by the enrichment calls.
        city: City the items belong to (part of the image-cache key).

    Returns:
        ``{"fixed": int, "still_missing": int, "fallback_replaced": int}``:
        ``fixed`` counts items that gained an image in pass 1 or 3,
        ``fallback_replaced`` counts items whose URL changed in pass 2, and
        ``still_missing`` counts items without an image once all passes ran.
    """
    fixed = 0
    fallback_replaced = 0

    # Pass 1: items that have no image at all.
    for item in items:
        if item.get("image_url"):
            continue
        _evict_and_enrich(item, city)
        if item.get("image_url"):
            fixed += 1
        time.sleep(0.5)

    # Pass 2: city-fallback images. The flagged set is decided up front:
    # every successful replacement lowers the live share count, so counting
    # inside the loop would let the last two sharers of a 3-way fallback
    # slip through unchecked.
    items_with_urls = [it for it in items if it.get("image_url")]
    flagged = [
        it for it in items_with_urls
        if _is_city_fallback_url(it["image_url"], items_with_urls)
    ]
    for item in flagged:
        old_url = item["image_url"]
        _evict_and_enrich(item, city)
        new_url = item.get("image_url", "")
        if new_url and new_url != old_url:
            fallback_replaced += 1
        time.sleep(0.5)

    # Pass 3: second attempt for stubborn items, with a longer delay.
    for item in items:
        if item.get("image_url"):
            continue
        time.sleep(2.0)
        _evict_and_enrich(item, city)
        if item.get("image_url"):
            fixed += 1

    # Count from the final state rather than by increment/decrement, so the
    # figure stays correct (and non-negative) whatever the passes did.
    still_missing = sum(1 for it in items if not it.get("image_url"))

    return {"fixed": fixed, "still_missing": still_missing, "fallback_replaced": fallback_replaced}
|
|
|
|
def check_duplicate_images(items: list[dict], city: str) -> dict:
    """Find items sharing the same image URL within this batch and re-fetch them.

    Items are grouped by URL as dict objects rather than by name, so every
    sharer is re-fetched exactly once — including items that have no
    ``name`` or whose name occurs more than once in the batch.

    Args:
        items: Recommendation dicts; they are mutated by the enrichment calls.
        city: City the items belong to (part of the image-cache key).

    Returns:
        ``{"fixed": int, "still_fallback": int}``: ``fixed`` counts re-fetched
        items that ended up with a different, non-empty URL;
        ``still_fallback`` counts re-fetched items that did not.
    """
    fixed = 0
    still_fallback = 0

    sharers_by_url: dict[str, list[dict]] = {}
    for item in items:
        url = item.get("image_url", "")
        if url:
            sharers_by_url.setdefault(url, []).append(item)

    for url, sharers in sharers_by_url.items():
        if len(sharers) < 2:
            continue
        for item in sharers:
            img_key = (item.get("name", "???"), city, "")
            # Evict only when the cache still holds the duplicated URL.
            if img_key in _IMAGE_CACHE and _IMAGE_CACHE[img_key] == url:
                del _IMAGE_CACHE[img_key]
            _enrich_one_item(item, city=city)
            new_url = item.get("image_url", "")
            if new_url and new_url != url:
                fixed += 1
            else:
                still_fallback += 1
            time.sleep(0.5)

    return {"fixed": fixed, "still_fallback": still_fallback}
|
|
|
|
| |
|
|
def fmt_provider_line(log: list[dict]) -> str:
    """Condense the provider attempt log into one readable line.

    Each entry renders as ``<label> <✅|❌> <elapsed>s``; successful entries
    additionally get ``(<items>it)``. Entries are joined with `` | ``, e.g.
    'OR-DS ✅ 1.2s (19it) | Gemini ❌ 3.4s'. Providers without a short label
    are shown under their raw name.
    """
    short_names = {
        "openrouter-deepseek": "OR-DS",
        "ollama-cloud": "Ollama",
        "openrouter-gemma": "Gemma",
        "gemini": "Gemini",
    }
    segments = []
    for attempt in log:
        provider = attempt.get("provider", "?")
        label = short_names.get(provider, provider)
        secs = attempt.get("elapsed", "?")
        if attempt.get("status") == "success":
            count = attempt.get("items", 0)
            segments.append(f"{label} ✅ {secs}s ({count}it)")
        else:
            segments.append(f"{label} ❌ {secs}s")
    return " | ".join(segments)
|
|
|
|
| |
|
|
def _save_all_caches() -> None:
    """Flush the LLM, image and geocode caches to disk."""
    # NOTE(review): _save_llm_cache is assumed to take no arguments, like its
    # two siblings — confirm against services.recommender.
    _save_llm_cache()
    _save_image_cache()
    _save_geocode_cache()


def _select_cities(cities_filter: list[str] | None) -> list[str]:
    """Resolve the city filter against CITIES, warning about names that match nothing."""
    if not cities_filter:
        return CITIES
    unknown = [name for name in cities_filter if name not in CITIES]
    if unknown:
        print(f" ⚠️ Ignoring unknown cities: {', '.join(unknown)}")
    chosen = [name for name in CITIES if name in cities_filter]
    if not chosen:
        # Historical behaviour: an unmatched filter falls back to every city.
        print(" ⚠️ No requested city is in CITIES — warming up all cities")
        return CITIES
    return chosen


def _fix_cached_combo(city: str, cat: str, tag: str, stats: dict) -> None:
    """Fix-only mode for one combo: re-check images on cached items, no LLM call."""
    items = _LLM_CACHE.get((city, _cat_hash(cat)))
    if not items:
        print(f"{tag} ⏭️ {city} / {cat} — no cached data to fix")
        return
    print(f"{tag} 🔧 {city} / {cat} ({len(items)} items)...", end=" ", flush=True)

    check_result = check_items(items, city)
    dup_result = check_duplicate_images(items, city)
    total_fixed = check_result["fixed"] + check_result["fallback_replaced"] + dup_result["fixed"]
    if total_fixed > 0:
        # The cached items themselves were mutated, so the LLM cache has to
        # be written too — not just the image cache.
        _save_all_caches()
        stats["images_fixed"] += total_fixed
        print(f"✅ fixed {total_fixed}")
    else:
        print("✅ no issues found")


def _repair_images(result: list[dict], city: str, stats: dict) -> None:
    """Run both image checks on freshly generated items, report, and persist fixes."""
    check_result = check_items(result, city)
    if check_result["fixed"] > 0:
        print(f" 📸 Fixed {check_result['fixed']} missing images")
        stats["images_fixed"] += check_result["fixed"]
    if check_result["fallback_replaced"] > 0:
        print(f" 🖼 Replaced {check_result['fallback_replaced']} city-fallback images")
        stats["images_fixed"] += check_result["fallback_replaced"]
    if check_result["fixed"] > 0 or check_result["fallback_replaced"] > 0:
        _save_all_caches()
    if check_result["still_missing"] > 0:
        print(f" ⚠️ {check_result['still_missing']} still missing (all tiers exhausted)")

    dup_result = check_duplicate_images(result, city)
    if dup_result["fixed"] > 0:
        print(f" 🖼 Re-fetched {dup_result['fixed']} duplicate images")
        _save_all_caches()
        stats["images_fixed"] += dup_result["fixed"]
    if dup_result["still_fallback"] > 0:
        print(f" ⚠️ {dup_result['still_fallback']} still using fallback")


def warmup(cities_filter: list[str] | None = None, fix_only: bool = False) -> None:
    """Warm the recommendation caches one (city × category) combo at a time.

    For every combo, in normal mode:
      1. skip it if the progress file already records it as a success;
      2. generate recommendations via ``get_recommendations_cached``;
      3. save all three caches (checkpoint);
      4. check the items for missing / fallback / duplicate images and save
         the caches again whenever something was fixed;
      5. record the outcome in the progress file, so an interrupted run can
         resume and failed combos are retried next time.

    In fix-only mode no LLM call is made and the progress file is not
    touched: only the image checks run, on whatever ``_LLM_CACHE`` already
    holds for the combo.

    Args:
        cities_filter: Restrict the run to these city names (must appear in
            ``CITIES``). Unknown names are reported and ignored; if none
            matches, all cities are processed.
        fix_only: Run in fix-only mode.
    """
    cities = _select_cities(cities_filter)
    total = len(cities) * len(CATEGORIES)

    print("═" * 60)
    print(" Roamify — Unified Warmup")
    print(f" {len(cities)} cities × {len(CATEGORIES)} categories = {total} combos")
    print(f" Cities: {', '.join(cities)}")
    print("═" * 60)

    progress = load_progress()
    # Guard against a progress file that parsed fine but lacks the section.
    progress.setdefault("combos", {})
    llm_before = len(_LLM_CACHE)
    img_before = len(_IMAGE_CACHE)
    geo_before = len(_GEOCODE_CACHE)

    stats = {"success": 0, "failed": 0, "skipped": 0, "images_fixed": 0}
    combo_idx = 0

    for city in cities:
        for cat in CATEGORIES:
            combo_idx += 1
            cid = combo_id(city, cat)
            tag = f" [{combo_idx:>2}/{total}]"

            if fix_only:
                _fix_cached_combo(city, cat, tag, stats)
                continue

            if is_combo_done(progress, cid):
                print(f"{tag} ⏭️ {city} / {cat} — already cached")
                stats["skipped"] += 1
                continue

            # Drop a cached None for this combo before regenerating —
            # presumably a leftover of an earlier failed attempt (TODO confirm).
            key = (city, _cat_hash(cat))
            if key in _LLM_CACHE and _LLM_CACHE[key] is None:
                del _LLM_CACHE[key]

            provider_log: list[dict] = []
            print(f"{tag} 🔍 {city} / {cat}...", end=" ", flush=True)
            start = time.time()

            try:
                result = get_recommendations_cached(
                    city=city,
                    num_attractions=19,
                    categories=_cat_dict(cat),
                    temperature=0,
                    provider_log=provider_log,
                )
            except Exception as e:
                # Boundary handler: record the failure and move on so one bad
                # combo cannot abort the whole run.
                elapsed = time.time() - start
                print(f"❌ {elapsed:.1f}s — {e}")
                progress["combos"][cid] = {
                    "status": "failed", "elapsed": round(elapsed, 1),
                    "error": str(e), "timestamp": datetime.now().isoformat(),
                }
                save_progress(progress)
                stats["failed"] += 1
                continue
            elapsed = time.time() - start

            provider_line = fmt_provider_line(provider_log)

            if result:
                item_count = len(result)
                print(f"✅ {item_count} items, {elapsed:.1f}s")
                print(f" {provider_line}")

                # Checkpoint before the (slow) image checks.
                _save_all_caches()
                _repair_images(result, city, stats)

                progress["combos"][cid] = {
                    "status": "success", "items": item_count,
                    "elapsed": round(elapsed, 1),
                    "provider_chain": provider_log,
                    "timestamp": datetime.now().isoformat(),
                }
                stats["success"] += 1
            else:
                print(f"❌ returned None, {elapsed:.1f}s")
                print(f" {provider_line}")
                progress["combos"][cid] = {
                    "status": "failed", "elapsed": round(elapsed, 1),
                    "provider_chain": provider_log,
                    "error": "all providers returned None",
                    "timestamp": datetime.now().isoformat(),
                }
                stats["failed"] += 1

            save_progress(progress)

            # Pause between combos, but not after the very last one.
            if combo_idx < total:
                time.sleep(1.5)

    llm_new = len(_LLM_CACHE) - llm_before
    img_new = len(_IMAGE_CACHE) - img_before
    geo_new = len(_GEOCODE_CACHE) - geo_before

    print()
    print("═" * 60)
    print(" Warmup complete!")
    print(f" Combos: {stats['success']} ✅ · {stats['failed']} ❌ · {stats['skipped']} ⏭️")
    print(f" Images fixed during check: {stats['images_fixed']}")
    # ":+d" keeps "+N" for growth and prints "-N" (not "+-N") if a cache shrank.
    print(f" Cache entries: LLM {llm_before} → {len(_LLM_CACHE)} ({llm_new:+d})")
    print(f" Images {img_before} → {len(_IMAGE_CACHE)} ({img_new:+d})")
    print(f" Geocode {geo_before} → {len(_GEOCODE_CACHE)} ({geo_new:+d})")
    print()

    failed_combos = [cid for cid, e in progress["combos"].items() if e.get("status") == "failed"]
    if failed_combos:
        print(" Failed combos (will retry next run):")
        for cid in failed_combos:
            print(f" ❌ {cid.replace('::', ' / ')}")
    print("═" * 60)
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: parse the flags, then hand off to warmup().
    cli = argparse.ArgumentParser(description="Warmup caches for Roamify")
    cli.add_argument(
        "--city",
        "-c",
        dest="cities",
        action="append",
        help="Only warmup specific city(ies). Repeat for multiple: -c Tokyo -c Paris",
    )
    cli.add_argument(
        "--fix",
        action="store_true",
        help="Fix-only mode: skip LLM generation, only re-check images on cached entries",
    )
    options = cli.parse_args()
    warmup(cities_filter=options.cities, fix_only=options.fix)
|
|