wanderlust-chatbot / scripts /processors /normalize_destinations.py
Kiriten892's picture
feat: security audit fixes, performance improvements & global data pipeline
dea44a6
Raw
History Blame Contribute Delete
4.48 kB
"""
Enrich destinations.json with scraped attraction data.
Appends scraped attractions to each matching destination's popular_activities,
skipping any whose English name is already listed.
"""
import json
import logging
from typing import Optional
from scripts.scrapers.config import PATHS
logger = logging.getLogger(__name__)
# Maps a raw scraped attraction type to the activity category stored in
# destinations.json. Keys MUST be lowercase: normalize_activity() lowercases
# the raw type before looking it up, so a mixed-case key can never match.
ACTIVITY_TYPE_MAP = {
    "museum": "sightseeing",
    "attraction": "sightseeing",
    "monument": "sightseeing",
    "castle": "sightseeing",
    "ruins": "sightseeing",
    "viewpoint": "sightseeing",
    "park": "nature",
    "beach": "beach",
    "theme_park": "entertainment",
    "gallery": "culture",
    "arts_centre": "culture",
    "theatre": "culture",
    "place_of_worship": "culture",
    "touristattraction": "sightseeing",
    "landmarksorhistoricalbuildings": "sightseeing",
}


def _parse_coordinate(value) -> Optional[float]:
    """Return *value* as a float, or None when it is missing or unparseable."""
    # Booleans are numeric in Python but are never a meaningful coordinate.
    if value is None or value == "" or isinstance(value, bool):
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # One malformed scraped value must not abort the whole enrichment run.
        return None


def normalize_activity(raw: dict) -> Optional[dict]:
    """Convert a raw scraped attraction into a popular_activities entry.

    Args:
        raw: Scraped attraction dict. Reads the keys "name_en", "name",
            "type", "lat", "lon", "opening_hours", "description" and "source".

    Returns:
        A dict in the popular_activities schema, or None when the attraction
        has no usable name (missing, or shorter than 3 characters once
        stripped).
    """
    name = (raw.get("name_en") or raw.get("name") or "").strip()
    if len(name) < 3:
        return None

    # Scraped types are not guaranteed to be strings (e.g. a list of types);
    # anything unusable is treated as a generic attraction.
    raw_type = raw.get("type")
    if not isinstance(raw_type, str) or not raw_type.strip():
        raw_type = "attraction"
    activity_type = ACTIVITY_TYPE_MAP.get(raw_type.strip().lower(), "sightseeing")

    return {
        "name": raw.get("name") or name,
        "name_en": name,
        "type": activity_type,
        "cost_vnd": 0,  # Unknown without deeper scraping
        "duration_hours": 2,  # Default estimate
        "coordinates": {
            # Explicit None checks: 0 is a valid latitude/longitude
            # (equator / prime meridian) and must not be dropped.
            "lat": _parse_coordinate(raw.get("lat")),
            "lon": _parse_coordinate(raw.get("lon")),
        },
        # `or` (not a .get default) so an explicit null also gets the default.
        "opening_hours": raw.get("opening_hours") or "08:00-17:00",
        "time_from_center_min": 15,
        "best_time": "morning",
        "description_en": (raw.get("description") or "")[:200].strip(),
        "_source": raw.get("source") or "unknown",
    }
def enrich_destinations(scraped_cities: list) -> dict:
    """Append newly scraped attractions to the destinations database.

    For each scraped city, finds the destination with the same id in the
    file at PATHS["destinations_db"] and appends every normalized attraction
    whose English name (case-insensitive) is not already listed in that
    destination's "popular_activities". The file is rewritten only when at
    least one activity was added.

    Args:
        scraped_cities: List of city dicts with "destination_id" and
            "attractions" keys. Cities with no id, no attractions, or an id
            that is not present in the database are skipped.

    Returns:
        Stats dict with "destinations_enriched" and "activities_added" counts.
    """
    dest_path = PATHS["destinations_db"]
    with open(dest_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    # The file is either a {"destinations": [...]} wrapper or a bare list;
    # remember which so the same shape is written back.
    is_wrapped = isinstance(raw_data, dict)
    if is_wrapped:
        # `or []` also covers an explicit null under "destinations".
        destinations = raw_data.get("destinations") or []
    else:
        destinations = raw_data

    # Index destination dicts by id. Entries without an id can never be
    # matched, because cities with an empty destination_id are skipped below.
    dest_by_id = {
        d["id"]: d
        for d in destinations
        if isinstance(d, dict) and d.get("id")
    }

    enriched = 0
    activities_added = 0
    for city_data in scraped_cities:
        dest_id = city_data.get("destination_id", "")
        attractions = city_data.get("attractions", [])
        if not dest_id or not attractions:
            continue
        dest = dest_by_id.get(dest_id)
        if dest is None:
            continue

        # `or []` also covers "popular_activities": null in the stored JSON.
        existing_activities = dest.get("popular_activities") or []
        # Fall back to "name" so hand-written entries that lack "name_en"
        # still take part in de-duplication (normalize_activity applies the
        # same fallback when it builds "name_en").
        existing_names = {
            (a.get("name_en") or a.get("name") or "").lower()
            for a in existing_activities
        }

        added_for_city = 0
        for raw in attractions:
            activity = normalize_activity(raw)
            if activity is None:
                continue
            key = activity["name_en"].lower()
            if key in existing_names:
                continue
            existing_activities.append(activity)
            existing_names.add(key)
            added_for_city += 1

        if added_for_city > 0:
            # `dest` is the same object held in `destinations`, so assigning
            # the list here is enough; no re-insertion by index is needed.
            dest["popular_activities"] = existing_activities
            enriched += 1
            activities_added += added_for_city
            logger.info(f"Enriched {dest_id}: +{added_for_city} activities")

    if activities_added > 0:
        if is_wrapped:
            raw_data["destinations"] = destinations
        # Serialize BEFORE opening the file for writing: mode "w" truncates
        # immediately, so a serialization error after that point would leave
        # the database empty or half-written.
        payload = json.dumps(raw_data, ensure_ascii=False, indent=2)
        with open(dest_path, "w", encoding="utf-8") as f:
            f.write(payload)

    stats = {"destinations_enriched": enriched, "activities_added": activities_added}
    logger.info(f"Destination enrichment: {enriched} cities, +{activities_added} activities total")
    return stats