#!/usr/bin/env python3
"""Fix missing images across all cached cities using parallel enrichment."""
import concurrent.futures
import json
import os
import sys
import time  # noqa: F401  (not referenced by this script at present)

# Cities and categories whose single-category cache entries are scanned.
CITIES = [
    'Paris', 'London', 'Rome', 'Barcelona', 'New York', 'Tokyo', 'Bangkok',
    'Sydney', 'Cape Town', 'Rio de Janeiro', 'Istanbul', 'Dubai', 'Seoul',
    'Bali', 'Prague', 'San Francisco', 'Marrakech', 'Kyoto',
]
CATS = ['Landmark', 'Culture', 'Nature', 'Gems', 'Photo', 'Food', 'Shopping']

# Number of cities enriched concurrently by the thread pool.
MAX_WORKERS = 4


def cat_hash(name):
    """Return the cache-key fragment for a selection with only *name* enabled.

    The result is a JSON object (keys sorted) mapping every category in
    ``CATS`` to ``True`` when it equals *name* and ``False`` otherwise. It is
    used as the second element of the ``(city, hash)`` keys looked up in the
    LLM cache.
    """
    selection = {c: (c == name) for c in CATS}
    return json.dumps(selection, sort_keys=True)


def collect_missing(llm_cache):
    """Group cached items that have no ``image_url`` by city.

    Args:
        llm_cache: Mapping keyed by ``(city, cat_hash(category))`` whose
            values are lists of item dicts (or a falsy value such as
            ``None`` / ``[]`` when nothing is cached).

    Returns:
        A dict ``{city: [item, ...]}`` in ``CITIES`` order containing only
        cities with at least one item whose ``image_url`` is missing or
        falsy. The item dicts are the same objects held by *llm_cache*.
    """
    by_city = {}
    for city in CITIES:
        city_items = []
        for cat in CATS:
            # ``or []`` tolerates both absent keys and cached falsy values.
            cached = llm_cache.get((city, cat_hash(cat))) or []
            city_items.extend(it for it in cached if not it.get("image_url"))
        if city_items:
            by_city[city] = city_items
    return by_city


def count_missing(groups):
    """Count items without an ``image_url`` across an iterable of item lists.

    Falsy groups (``None``, empty lists) are skipped.
    """
    return sum(
        1
        for items in groups
        if items
        for it in items
        if not it.get("image_url")
    )


def main():
    """Enrich every tracked item that lacks an image and print a summary."""
    script_dir = os.path.dirname(__file__)

    # Bootstrap order matters: extend sys.path and load .env *before* the
    # recommender module is imported, exactly as the flat script did.
    # NOTE(review): presumably the recommender reads environment variables at
    # import time - confirm.
    sys.path.insert(0, os.path.join(script_dir, "..", "src"))

    from dotenv import load_dotenv
    load_dotenv(dotenv_path=os.path.join(script_dir, "..", ".env"), override=True)

    from services.recommender import (
        _LLM_CACHE,
        _IMAGE_CACHE,
        _save_image_cache,
        _enrich_with_images,
    )

    # Group missing-image items by city for parallel enrichment.
    by_city = collect_missing(_LLM_CACHE)
    for city, items in by_city.items():
        print(f'{city}: {len(items)} items missing images')
    total_missing = sum(len(items) for items in by_city.values())
    print(f'\nTotal items missing images: {total_missing}')

    # Enrich each city's items in parallel (MAX_WORKERS cities at a time).
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        futures = {
            pool.submit(_enrich_with_images, items, city=city): city
            for city, items in by_city.items()
        }
        for fut in concurrent.futures.as_completed(futures):
            city = futures[fut]
            # Best effort: a failure for one city must not abort the others.
            try:
                result = fut.result()
                fixed = sum(1 for it in result if it.get("image_url"))
                print(f' {city}: fixed {fixed}/{len(by_city[city])} remaining')
            except Exception as e:
                print(f' {city}: error - {e}')

    _save_image_cache()

    # Final tally. "Still missing" is computed over the same items that made
    # up ``total_missing`` so the two numbers are comparable; the cache-wide
    # figure (which also covers entries outside CITIES x CATS) is reported
    # separately.
    # NOTE(review): both counts rely on _enrich_with_images updating the item
    # dicts in place - confirm.
    still_missing = count_missing(by_city.values())
    cache_wide_missing = count_missing(_LLM_CACHE.values())
    print(f'\nStill missing after fix: {still_missing} (from {total_missing})')
    print(f'Missing across entire LLM cache: {cache_wide_missing}')
    print(f'Image cache entries: {len(_IMAGE_CACHE)}')


if __name__ == "__main__":
    main()