"""Normalize EM-DAT location field into a single short search token. The EM-DAT `location` field is inconsistent — sometimes a single city ("Valencia Province"), sometimes a 10-item list of tiny administrative units ("Gorenjska, Goriska, Jugovzodna Slovenija, Koroska, ..."), sometimes null. Neither raw form is directly usable in a news search query. This module wraps a single LLM call that returns ONE token suitable for dropping into a GDELT query. When the location is too diffuse or absent, it falls back to the country name. """ from __future__ import annotations import logging import re from src.llm.client import LLMClient from src.models.schemas import FloodEvent logger = logging.getLogger(__name__) def extract_location_token( event: FloodEvent, llm_client: LLMClient, config: dict ) -> str: """Return a single place token for use in news queries. Falls back to the country name on any LLM failure or empty output. """ variables = { "country": event.country, "location": event.location or "null", } try: response = llm_client.call_with_config( prompt_key="location_extract", knowledge_key="search", # reuse expert_search as knowledge placeholder variables=variables, config=config, ) except Exception as e: logger.warning(f"Location extraction failed for {event.event_id}: {e}") return event.country # Take first non-empty line, strip quotes/punctuation token = "" for line in response.splitlines(): s = line.strip().strip('"').strip("'").strip() if s: token = s break # Sanity checks: must be short, alphabetic-ish, not empty if not token or len(token) > 50 or not re.search(r"[A-Za-z]", token): logger.warning( f"Location extraction produced invalid token '{token}' for {event.event_id}; " f"falling back to country" ) return event.country return token