Spaces:

iPurushottam
/

ClimAI

Running

File size: 8,706 Bytes

from date_utils import parse_date
import re
import json
import logging
from groq_llm import client as groq_client

KNOWN_CYCLONES = ["michaung", "mandous", "nivar", "gaja", "vardah", "thane", "nisha",
                  "fani", "amphan", "hudhud", "phailin", "laila", "jal"]

KNOWN_LOCATIONS = ["chennai", "mumbai", "kolkata", "vizag", "visakhapatnam",
                   "bay of bengal", "arabian sea", "tamil nadu", "andhra pradesh",
                   "odisha", "west bengal", "india", "puducherry", "cuddalore",
                   "nagapattinam", "mahabalipuram"]

def _normalize_query(q: str) -> str:
    typo_map = {
        r"\bpervious\b": "previous",
        r"\bprevios\b": "previous",
        r"\bpreviuos\b": "previous",
        r"\bprevioues\b": "previous",
        r"\bprevius\b": "previous",
        r"\bprevioius\b": "previous",
        r"\bhistorcal\b": "historical",
        r"\bhistoricle\b": "historical",
        r"\byesterady\b": "yesterday",
        r"\byestarday\b": "yesterday",
    }
    for pattern, replacement in typo_map.items():
        q = re.sub(pattern, replacement, q)
    return q


def _expand_disaster_intents(intents: list) -> list:
    """If disaster intent detected, always include weather, cyclone, earthquake."""
    if "disaster" in intents:
        for extra in ["weather", "cyclone", "earthquake", "tsunami"]:
            if extra not in intents:
                intents.append(extra)
    return intents


def classify_query(query: str):
    q = _normalize_query(query.lower().strip())
    intents = []

    past_kw = ["last year", "previous", "history", "historical", "ago", "past",
               "same date", "same day", "this day", "yesterday", "back in",
               "was", "were", "happened", "occurred", "hit", "struck", "recent"]
    future_kw = ["predict", "prediction", "next", "forecast", "tomorrow",
                 "coming", "upcoming", "expect", "will", "probability",
                 "chance", "future", "model", "ml", "ai"]

    is_past = any(re.search(rf"\b{k}\b", q) for k in past_kw)
    is_future = any(re.search(rf"\b{k}\b", q) for k in future_kw)

    current_year = __import__("datetime").datetime.now().year
    past_year_match = re.search(r'\b(19\d{2}|20\d{2})\b', q)
    if past_year_match and int(past_year_match.group(1)) < current_year:
        is_past = True
        is_future = False

    weather_kw = ["weather", "temperature", "temp", "hot", "cold", "rain", "wind", "humidity",
                  "climate", "heat", "sunny", "cloudy", "precipitation", "pressure",
                  "detail", "condition", "report"]
    if any(re.search(rf"\b{k}\b", q) for k in weather_kw):
        if is_past: intents.append("weather_history")
        elif is_future: intents.append("prediction")
        else: intents.append("weather")

    cyclone_kw = ["cyclone", "hurricane", "typhoon", "storm", "wind storm", "tropical",
                  "bay of bengal", "vardah", "nivar", "gaja", "mandous", "michaung",
                  "thane", "nisha", "fani", "amphan", "hudhud"]
    if any(re.search(rf"\b{k}\b", q) for k in cyclone_kw):
        if is_future: intents.append("cyclone_prediction")
        else: intents.append("cyclone")

    quake_kw = ["earthquake", "quake", "seismic", "magnitude", "richter", "tremor",
                "tectonic", "fault", "aftershock", "usgs"]
    if any(re.search(rf"\b{k}\b", q) for k in quake_kw):
        intents.append("earthquake")

    tsunami_kw = ["tsunami", "tidal wave", "ocean wave", "indian ocean", "sumatra",
                  "krakatoa", "sulawesi", "wave height"]
    if any(re.search(rf"\b{k}\b", q) for k in tsunami_kw):
        intents.append("tsunami")

    if not intents and is_future:
        intents.append("prediction")

    disaster_kw = ["disaster", "catastrophe", "calamity", "danger", "risk",
                   "overview", "summary", "all", "report", "threat", "alert"]
    if any(re.search(rf"\b{k}\b", q) for k in disaster_kw):
        intents.append("disaster")

    if "compare" in q or "difference" in q or re.search(r"\bvs\b", q) or "versus" in q:
        intents.append("weather_comparison")

    is_range = bool(re.search(r'\b(19\d{2}|20\d{2})\s*(?:to|-|and)\s*(19\d{2}|20\d{2})\b', q))
    if is_range and "weather_comparison" not in intents:
        intents.append("weather_comparison")

    if not intents:
        intents.append("weather")

    return _expand_disaster_intents(list(set(intents)))


def extract_query_context(query: str):
    q = _normalize_query(query.lower().strip())
    
    cyclone_name = None
    for name in KNOWN_CYCLONES:
        if name in q:
            cyclone_name = name
            break
            
    year = None
    m = re.search(r'(?<!\d)(?<!\d[-/])(19\d{2}|20\d{2})(?![-/]\d)(?!\d)', q)
    if m: year = int(m.group(1))
            
    location = None
    for loc in KNOWN_LOCATIONS:
        if loc in q:
            location = loc
            break
            
    wants_recent = any(k in q for k in ["recent", "latest", "last", "newest", "most recent"])
    wants_comparison = any(k in q for k in ["compare", "vs", "versus", "difference", "than"])

    is_range = bool(re.search(r'\b(19\d{2}|20\d{2})\s*(?:to|-|and)\s*(19\d{2}|20\d{2})\b', q))
    if is_range:
        wants_comparison = True

    return {
        "cyclone_name": cyclone_name,
        "year": year,
        "location": location,
        "wants_recent": wants_recent,
        "wants_comparison": wants_comparison
    }


def extract_intent_with_llm(query: str) -> dict:
    system_prompt = """You are an intent classifier for a climate and disaster tracking app.

    Given a user query, you must extract their intent and basic context.

    The query may contain severe typos or bad grammar. You must figure out what they mean.

    

    Allowed intents: weather, weather_history, weather_comparison, prediction, cyclone, cyclone_history, cyclone_prediction, earthquake, tsunami, disaster.

    

    Output exactly valid JSON in this format:

    {

        "intents": ["list", "of", "intents"],

        "context": {

            "cyclone_name": null,

            "year": null,

            "location": null,

            "wants_recent": false,

            "wants_comparison": false

        }

    }

    """
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Query: {query}"}
            ],
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=200
        )
        result = json.loads(response.choices[0].message.content)
        if "intents" not in result or "context" not in result:
            raise ValueError("LLM returned malformed JSON structure")
        return result
    except Exception as e:
        logging.error(f"LLM extraction failed: {e}")
        return None


def plan_query(query: str):
    """

    Create a deterministic execution plan, using LLM for typo-tolerant intent extraction.

    Falls back to regex parsing if the LLM fails.

    """
    # 1. Try LLM Extraction First
    llm_result = extract_intent_with_llm(query)

    if llm_result:
        intents = llm_result.get("intents", [])
        context = llm_result.get("context", {})
        # Safety fallback if LLM returns empty intents
        if not intents:
            intents = classify_query(query)
        else:
            # Always apply disaster expansion even for LLM results
            intents = _expand_disaster_intents(intents)
    else:
        # 2. Fallback to Regex
        logging.warning("Falling back to regex intent classification")
        intents = classify_query(query)
        context = extract_query_context(query)

    date_val = parse_date(query)

    # 3. Select the primary intent
    primary_intent = "weather"
    if "weather_comparison" in intents:
        primary_intent = "weather_comparison"
    elif "disaster" in intents:
        primary_intent = "disaster"
    elif "cyclone" in intents or "cyclone_history" in intents:
        primary_intent = "cyclone_history"
    elif "weather_history" in intents:
        primary_intent = "weather_history"
    else:
        primary_intent = intents[0] if intents else "unknown"

    return {
        "intent": primary_intent,
        "all_intents": intents,
        "date": date_val,
        "query": query,
        "context": context
    }