roamify / scripts /fix_images.py
jofaichow's picture
v0.0.9 — Full cache sweep + adaptive radius fix
83adb51
#!/usr/bin/env python3
"""Fix missing images for a fixed list of cached cities using parallel enrichment."""
import sys, os, json, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)
from services.recommender import (
_LLM_CACHE, _IMAGE_CACHE, _save_image_cache,
_enrich_with_images,
)
# Cities whose cached recommendation lists are scanned for items that have
# no image_url.
CITIES = [
    "Paris",
    "London",
    "Rome",
    "Barcelona",
    "New York",
    "Tokyo",
    "Bangkok",
    "Sydney",
    "Cape Town",
    "Rio de Janeiro",
    "Istanbul",
    "Dubai",
    "Seoul",
    "Bali",
    "Prague",
    "San Francisco",
    "Marrakech",
    "Kyoto",
]
# Category names; each one is looked up separately in the cache.
CATS = [
    "Landmark",
    "Culture",
    "Nature",
    "Gems",
    "Photo",
    "Food",
    "Shopping",
]


def cat_hash(name):
    """Return the JSON string used as the category half of a cache key.

    The string encodes a mapping from every entry of ``CATS`` to a boolean
    that is true only for the entry equal to *name*, serialised with sorted
    keys so the result does not depend on the order of ``CATS``.
    """
    flags = {}
    for category in CATS:
        flags[category] = category == name
    return json.dumps(flags, sort_keys=True)
# Bucket every cached item that lacks an image_url under its city, so that
# each city can later be handed to the enrichment step as one batch.
by_city = {}
total_missing = 0
for city in CITIES:
    # One cache lookup per category; absent keys fall back to an empty list.
    cached_lists = (_LLM_CACHE.get((city, cat_hash(cat)), []) for cat in CATS)
    city_items = [
        entry
        for cached in cached_lists
        if cached
        for entry in cached
        if not entry.get("image_url")
    ]
    if not city_items:
        # Nothing to fix for this city: leave it out of by_city entirely.
        continue
    by_city[city] = city_items
    total_missing += len(city_items)
    print(f'{city}: {len(city_items)} items missing images')

print(f'\nTotal items missing images: {total_missing}')
# Enrich each city's items concurrently: one task per city, with at most
# MAX_WORKERS cities being processed at the same time.
import concurrent.futures

MAX_WORKERS = 4

try:
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Map each submitted future back to its city for reporting.
        futures = {
            pool.submit(_enrich_with_images, items, city=city): city
            for city, items in by_city.items()
        }
        for f in concurrent.futures.as_completed(futures):
            city = futures[f]
            try:
                result = f.result()
                fixed = sum(1 for it in result if it.get("image_url"))
                print(f' {city}: fixed {fixed}/{len(by_city[city])} remaining')
            except Exception as e:
                # Best-effort batch job: report the failing city and keep
                # going with the others.
                print(f' {city}: error - {e}')
finally:
    # Runs after the executor has shut down (all workers finished), and also
    # when the run is interrupted, so work completed so far is not dropped.
    # Presumably this persists _IMAGE_CACHE — confirm in services.recommender.
    _save_image_cache()
# Final tally. Re-count only the items collected in by_city so the figure is
# directly comparable with total_missing; scanning all of _LLM_CACHE would
# also include entries outside CITIES x CATS that were never counted or
# enriched, making "X (from Y)" compare two different populations.
# NOTE(review): assumes _enrich_with_images sets image_url on the item dicts
# in place (the previous whole-cache count relied on the same assumption).
still_missing = sum(
    1
    for items in by_city.values()
    for it in items
    if not it.get("image_url")
)
print(f'\nStill missing after fix: {still_missing} (from {total_missing})')
print(f'Image cache entries: {len(_IMAGE_CACHE)}')