"""
Enrich destinations.json with scraped attraction data.

Appends scraped attractions to the popular_activities list of each matching
destination, skipping attractions whose English name is already listed.
"""

import json
import logging
import math
from typing import Optional

from scripts.scrapers.config import PATHS

logger = logging.getLogger(__name__)

# Maps a raw scraped attraction type to the activity "type" stored in
# destinations.json. Types that are not listed fall back to "sightseeing".
ACTIVITY_TYPE_MAP = {
    "museum": "sightseeing",
    "attraction": "sightseeing",
    "monument": "sightseeing",
    "castle": "sightseeing",
    "ruins": "sightseeing",
    "viewpoint": "sightseeing",
    "park": "nature",
    "beach": "beach",
    "theme_park": "entertainment",
    "gallery": "culture",
    "arts_centre": "culture",
    "theatre": "culture",
    "place_of_worship": "culture",
    "touristattraction": "sightseeing",
    "landmarksOrHistoricalBuildings": "sightseeing",
}

# Raw types are lower-cased before lookup, so index the map by lower-cased key;
# otherwise mixed-case keys such as "landmarksOrHistoricalBuildings" could
# never match.
_ACTIVITY_TYPE_LOOKUP = {key.lower(): value for key, value in ACTIVITY_TYPE_MAP.items()}

_DEFAULT_ACTIVITY_TYPE = "sightseeing"
_DEFAULT_OPENING_HOURS = "08:00-17:00"


def _coerce_coordinate(value) -> Optional[float]:
    """Return *value* as a finite float, or None if it is missing or invalid."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # None, empty strings and non-numeric text all count as "unknown".
        return None
    # NaN/inf are not real coordinates and would be written as invalid JSON.
    return number if math.isfinite(number) else None


def normalize_activity(raw: dict) -> Optional[dict]:
    """Convert a raw scraped attraction to a popular_activities entry.

    Args:
        raw: Scraped attraction dict. Reads the keys "name_en", "name",
            "type", "lat", "lon", "opening_hours", "description" and "source".

    Returns:
        The activity entry dict, or None when the attraction has no usable
        name (empty or shorter than 3 characters after stripping).
    """
    name = (raw.get("name_en") or raw.get("name") or "").strip()
    if len(name) < 3:
        return None

    raw_type = (raw.get("type") or "attraction").lower()
    activity_type = _ACTIVITY_TYPE_LOOKUP.get(raw_type, _DEFAULT_ACTIVITY_TYPE)

    return {
        "name": raw.get("name") or name,
        "name_en": name,
        "type": activity_type,
        "cost_vnd": 0,  # Unknown without deeper scraping
        "duration_hours": 2,  # Default estimate
        "coordinates": {
            # A coordinate of 0 is valid (equator / prime meridian), so only
            # missing or unparseable values become None.
            "lat": _coerce_coordinate(raw.get("lat")),
            "lon": _coerce_coordinate(raw.get("lon")),
        },
        # `or` rather than a .get() default so an explicit null/empty value
        # also falls back to the default.
        "opening_hours": raw.get("opening_hours") or _DEFAULT_OPENING_HOURS,
        "time_from_center_min": 15,
        "best_time": "morning",
        "description_en": (raw.get("description") or "")[:200].strip(),
        "_source": raw.get("source") or "unknown",
    }


def enrich_destinations(scraped_cities: list) -> dict:
    """Append newly scraped activities to their destinations on disk.

    For each scraped city, find its destination in the destinations database
    (the file at PATHS["destinations_db"]) and append the activities that are
    not already listed there, comparing English names case-insensitively.
    The file is rewritten only when at least one activity was added.

    Args:
        scraped_cities: List of city dicts with "destination_id" and
            "attractions" keys.

    Returns:
        Stats dict with "destinations_enriched" and "activities_added" counts.

    Raises:
        OSError: If the destinations file cannot be read or written.
        json.JSONDecodeError: If the destinations file is not valid JSON.
    """
    dest_path = PATHS["destinations_db"]
    with open(dest_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Support both {"destinations": [...]} wrapper and bare list
    is_wrapped = isinstance(raw_data, dict)
    if is_wrapped:
        destinations = raw_data.get("destinations") or []
    else:
        destinations = raw_data

    # Build index by destination_id
    dest_index = {d.get("id", ""): i for i, d in enumerate(destinations)}

    enriched = 0
    activities_added = 0

    for city_data in scraped_cities or []:
        dest_id = city_data.get("destination_id", "")
        attractions = city_data.get("attractions", [])
        if not dest_id or not attractions:
            continue

        idx = dest_index.get(dest_id)
        if idx is None:
            continue

        dest = destinations[idx]
        # `or` guards against an explicit null stored in the database.
        existing_activities = dest.get("popular_activities") or []
        existing_names = {
            (a.get("name_en") or "").lower() for a in existing_activities
        }

        added_for_city = 0
        for raw in attractions:
            activity = normalize_activity(raw)
            if activity is None:
                continue
            key = activity["name_en"].lower()
            if key in existing_names:
                continue
            existing_activities.append(activity)
            existing_names.add(key)
            added_for_city += 1

        if added_for_city > 0:
            dest["popular_activities"] = existing_activities
            enriched += 1
            activities_added += added_for_city
            logger.info("Enriched %s: +%d activities", dest_id, added_for_city)

    if activities_added:
        if is_wrapped:
            raw_data["destinations"] = destinations
        # Serialize before opening the file for writing: opening with "w"
        # truncates it, so a serialization error must not be able to happen
        # after that point.
        serialized = json.dumps(raw_data, ensure_ascii=False, indent=2)
        with open(dest_path, "w", encoding="utf-8") as f:
            f.write(serialized)

    stats = {"destinations_enriched": enriched, "activities_added": activities_added}
    logger.info(
        "Destination enrichment: %d cities, +%d activities total",
        enriched,
        activities_added,
    )
    return stats