cascade_risk / src /data /location.py
Lucasoppem's picture
Sync from GitHub main (part 2)
36f9d47 verified
Raw
History Blame Contribute Delete
2.02 kB
"""Normalize EM-DAT location field into a single short search token.
The EM-DAT `location` field is inconsistent — sometimes a single place
("Valencia Province"), sometimes a 10-item list of tiny administrative units
("Gorenjska, Goriska, Jugovzodna Slovenija, Koroska, ..."), sometimes null.
Neither raw form is directly usable in a news search query.
This module wraps a single LLM call that returns ONE token suitable for
dropping into a GDELT query. When the location is too diffuse or absent,
it falls back to the country name.
"""
from __future__ import annotations
import logging
import re
from src.llm.client import LLMClient
from src.models.schemas import FloodEvent
logger = logging.getLogger(__name__)
# Longest token accepted; anything longer is treated as an unusable LLM answer.
_MAX_TOKEN_LEN = 50

# Runs of whitespace, ASCII quotes, backticks and typographic quotes. Used to
# peel wrappers off BOTH edges of a line in one pass, regardless of the order
# in which the model nested them (e.g. `" 'Valencia' "`).
_EDGE_CHARS = r"""[\s"'`\u201c\u201d\u2018\u2019]+"""
_EDGE_NOISE = re.compile(rf"\A{_EDGE_CHARS}|{_EDGE_CHARS}\Z")


def extract_location_token(
    event: FloodEvent, llm_client: LLMClient, config: dict
) -> str:
    """Return a single place token for use in news queries.

    Sends the event's country and location to the LLM (prompt key
    ``location_extract``) and keeps the first non-empty line of the reply,
    with surrounding whitespace and quote characters removed.

    Args:
        event: Event providing ``country``, ``location`` and ``event_id``.
            A falsy ``location`` is sent to the prompt as the string "null".
        llm_client: Client exposing ``call_with_config``.
        config: Passed through unchanged to ``call_with_config``.

    Returns:
        The cleaned token, or ``event.country`` when the LLM call raises,
        the reply is not a string, or the token is empty, longer than 50
        characters, or contains no ASCII letter.
    """
    variables = {
        "country": event.country,
        "location": event.location or "null",
    }
    try:
        response = llm_client.call_with_config(
            prompt_key="location_extract",
            knowledge_key="search",  # reuse expert_search as knowledge placeholder
            variables=variables,
            config=config,
        )
    except Exception as e:
        # Deliberate best-effort: any client failure degrades to the country.
        logger.warning(f"Location extraction failed for {event.event_id}: {e}")
        return event.country

    # A None / non-string reply is handled like an empty one, so it reaches
    # the fallback below instead of raising AttributeError on .splitlines().
    text = response if isinstance(response, str) else ""

    # First line that still has content after the edge characters are removed.
    cleaned_lines = (_EDGE_NOISE.sub("", line) for line in text.splitlines())
    token = next((s for s in cleaned_lines if s), "")

    # Sanity checks: must be short, alphabetic-ish, not empty
    if (
        not token
        or len(token) > _MAX_TOKEN_LEN
        or not re.search(r"[A-Za-z]", token)
    ):
        logger.warning(
            f"Location extraction produced invalid token '{token}' for {event.event_id}; "
            f"falling back to country"
        )
        return event.country
    return token