#!/usr/bin/env python3
"""Fix missing images across all cached cities using parallel enrichment."""
import sys, os, json, time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)
from services.recommender import (
_LLM_CACHE, _IMAGE_CACHE, _save_image_cache,
_enrich_with_images,
)
# Cities and categories whose cached recommendations are scanned for
# missing images.
CITIES = [
    'Paris',
    'London',
    'Rome',
    'Barcelona',
    'New York',
    'Tokyo',
    'Bangkok',
    'Sydney',
    'Cape Town',
    'Rio de Janeiro',
    'Istanbul',
    'Dubai',
    'Seoul',
    'Bali',
    'Prague',
    'San Francisco',
    'Marrakech',
    'Kyoto',
]
CATS = [
    'Landmark',
    'Culture',
    'Nature',
    'Gems',
    'Photo',
    'Food',
    'Shopping',
]


def cat_hash(name):
    """Return a JSON string flagging *name* as the only selected category.

    The result maps every entry of ``CATS`` to ``True`` when it equals
    *name* and ``False`` otherwise, serialized with sorted keys so the
    string is stable.  It is used below as the second element of the
    ``_LLM_CACHE`` lookup key.
    """
    selection = {}
    for category in CATS:
        selection[category] = category == name
    return json.dumps(selection, sort_keys=True)
# Bucket the cached items that lack an image by city, so that each city can
# later be handed to the enrichment step as one unit of work.
by_city = {}
total_missing = 0
for city in CITIES:
    # Walk every single-category cache entry for this city and keep the
    # items whose "image_url" is absent or empty.
    lacking = [
        entry
        for cat in CATS
        for entry in (_LLM_CACHE.get((city, cat_hash(cat))) or ())
        if not entry.get("image_url")
    ]
    if not lacking:
        continue  # nothing to fix for this city
    by_city[city] = lacking
    total_missing += len(lacking)
    print(f'{city}: {len(lacking)} items missing images')

print(f'\nTotal items missing images: {total_missing}')
# Enrich the missing-image items concurrently: one task per city, executed
# on a pool of 4 worker threads.
import concurrent.futures

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    # Remember which city each future belongs to so results can be
    # reported in completion order.
    futures = {
        pool.submit(_enrich_with_images, pending, city=name): name
        for name, pending in by_city.items()
    }
    for done in concurrent.futures.as_completed(futures):
        name = futures[done]
        try:
            enriched = done.result()
            fixed = len([entry for entry in enriched if entry.get("image_url")])
            print(f'  {name}: fixed {fixed}/{len(by_city[name])} remaining')
        except Exception as exc:
            # Best effort: a failure for one city is reported and does not
            # stop the remaining cities from being processed.
            print(f'  {name}: error - {exc}')

# Save the image cache once all cities have finished.
_save_image_cache()
# Final tally.  Recount over exactly the cache entries that produced
# total_missing (every CITIES x CATS single-category key) so the "after"
# figure is comparable with the "before" figure.  Counting every value in
# _LLM_CACHE would also include entries this script never scanned, which
# could make "still missing" exceed the starting total.
still_missing = sum(
    1
    for city in CITIES
    for cat in CATS
    for it in (_LLM_CACHE.get((city, cat_hash(cat))) or ())
    if not it.get("image_url")
)
print(f'\nStill missing after fix: {still_missing} (from {total_missing})')
print(f'Image cache entries: {len(_IMAGE_CACHE)}')
|