Spaces:
Sleeping
Sleeping
| """ | |
| Enrich destinations.json with scraped attraction data. | |
| Appends scraped attractions to each matching destination's popular_activities, skipping names that are already listed. | |
| """ | |
| import json | |
| import logging | |
| from typing import Optional | |
| from scripts.scrapers.config import PATHS | |
logger = logging.getLogger(__name__)

# Maps a scraped attraction type to the activity category stored in
# popular_activities. normalize_activity() lowercases the raw type before
# looking it up here and falls back to "sightseeing" for unlisted types, so
# keys must be lowercase to be reachable from that lookup.
ACTIVITY_TYPE_MAP = {
    "museum": "sightseeing",
    "attraction": "sightseeing",
    "monument": "sightseeing",
    "castle": "sightseeing",
    "ruins": "sightseeing",
    "viewpoint": "sightseeing",
    "park": "nature",
    "beach": "beach",
    "theme_park": "entertainment",
    "gallery": "culture",
    "arts_centre": "culture",
    "theatre": "culture",
    "place_of_worship": "culture",
    "touristattraction": "sightseeing",
    # Lowercase spelling: the only form a lowercased lookup can hit.
    "landmarksorhistoricalbuildings": "sightseeing",
    # Original mixed-case key, kept so any exact-case lookup keeps working.
    "landmarksOrHistoricalBuildings": "sightseeing",
}
def _parse_coordinate(value):
    """Return *value* as a float, or None when it is missing or unparsable."""
    # Test for "missing" explicitly instead of relying on truthiness: 0 is a
    # valid latitude/longitude and must not be turned into None.
    if value is None or value == "" or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # Scraped input is not validated upstream; treat a malformed
        # coordinate as unknown rather than aborting the whole run.
        return None


def normalize_activity(raw: dict) -> Optional[dict]:
    """Convert a raw scraped attraction into a popular_activities entry.

    Args:
        raw: Scraped attraction dict. Keys read: "name_en", "name", "type",
            "lat", "lon", "opening_hours", "description", "source".

    Returns:
        The normalized entry dict, or None when the attraction has no usable
        name (missing, or shorter than 3 characters after stripping).
    """
    name = (raw.get("name_en") or raw.get("name") or "").strip()
    if len(name) < 3:
        return None

    # Type lookup is case-insensitive; unknown types count as sightseeing.
    raw_type = (raw.get("type") or "attraction").lower()
    activity_type = ACTIVITY_TYPE_MAP.get(raw_type, "sightseeing")

    return {
        "name": raw.get("name") or name,
        "name_en": name,
        "type": activity_type,
        "cost_vnd": 0,  # Unknown without deeper scraping
        "duration_hours": 2,  # Default estimate
        "coordinates": {
            "lat": _parse_coordinate(raw.get("lat")),
            "lon": _parse_coordinate(raw.get("lon")),
        },
        # `or` (not a .get default) so an explicit null/empty value also
        # receives the fallback, matching how name and description are read.
        "opening_hours": raw.get("opening_hours") or "08:00-17:00",
        "time_from_center_min": 15,
        "best_time": "morning",
        # Truncate first, then strip, so the stored text is at most 200 chars.
        "description_en": (raw.get("description") or "")[:200].strip(),
        "_source": raw.get("source") or "unknown",
    }
def enrich_destinations(scraped_cities: list) -> dict:
    """Append scraped attractions to matching entries in the destinations DB.

    Loads the JSON file at PATHS["destinations_db"]. For every scraped city
    whose "destination_id" matches a destination "id", each attraction is
    normalized and appended to that destination's "popular_activities" unless
    an activity with the same English name (case-insensitive) is already
    listed. The data is then written back in the same shape it was read in
    (wrapper dict or bare list).

    Args:
        scraped_cities: List of city dicts with "destination_id" and
            "attractions" keys.

    Returns:
        Stats dict with "destinations_enriched" (distinct destinations that
        gained at least one activity) and "activities_added".
    """
    dest_path = PATHS["destinations_db"]
    with open(dest_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # Support both {"destinations": [...]} wrapper and bare list.
    is_wrapped = isinstance(raw_data, dict)
    destinations = raw_data.get("destinations", []) if is_wrapped else raw_data

    # Index destinations by id; on duplicate ids the last one wins.
    dest_by_id = {d.get("id", ""): d for d in destinations}

    # A set, so a destination appearing twice in scraped_cities is counted once.
    enriched_ids = set()
    activities_added = 0

    for city_data in scraped_cities:
        dest_id = city_data.get("destination_id", "")
        attractions = city_data.get("attractions", [])
        if not dest_id or not attractions:
            continue

        dest = dest_by_id.get(dest_id)
        if dest is None:
            continue

        # "popular_activities" may be absent or null for destinations that
        # have no activities yet; both cases start from an empty list.
        existing_activities = dest.get("popular_activities") or []
        # A null "name_en" on an existing activity must not break deduplication.
        existing_names = {
            (a.get("name_en") or "").lower() for a in existing_activities
        }

        added_for_city = 0
        for raw in attractions:
            activity = normalize_activity(raw)
            if activity is None:
                continue
            name_key = activity["name_en"].lower()
            if name_key in existing_names:
                continue
            existing_activities.append(activity)
            existing_names.add(name_key)
            added_for_city += 1

        if added_for_city > 0:
            # Reassign: the list may be a fresh one if the key was missing/null.
            dest["popular_activities"] = existing_activities
            enriched_ids.add(dest_id)
            activities_added += added_for_city
            logger.info(f"Enriched {dest_id}: +{added_for_city} activities")

    if is_wrapped:
        raw_data["destinations"] = destinations

    # Serialize before opening the file: mode "w" truncates immediately, so a
    # serialization error during the write would otherwise destroy the DB.
    payload = json.dumps(raw_data, ensure_ascii=False, indent=2)
    with open(dest_path, "w", encoding="utf-8") as f:
        f.write(payload)

    enriched = len(enriched_ids)
    stats = {"destinations_enriched": enriched, "activities_added": activities_added}
    logger.info(f"Destination enrichment: {enriched} cities, +{activities_added} activities total")
    return stats