Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 30

Commit

eb5677d

verified ·

1 Parent(s): 16b5d3f

Update scenario_engine.py

Browse files

Files changed (1) hide show

scenario_engine.py +117 -89

scenario_engine.py CHANGED Viewed

@@ -1,71 +1,110 @@
 # scenario_engine.py
-# scenario_engine.py
 from __future__ import annotations
 from typing import Dict, List, Any, Tuple, Optional, Iterable
 import re, math, ast
 import numpy as np
 import pandas as pd
-# Optional import from column_resolver.py (recommended).
-# If it's not available, we define light fallbacks so the engine still works.
 try:
-    from column_resolver import resolve_one, resolve_cols  # full resolver (headers + synonyms)
 except Exception:
-    # ---- Minimal, schema-agnostic fallback (headers-only; safe, no hard-coding) ----
-    _ROLE_SYNONYMS = {
-        "facility": ["facility", "hospital", "centre", "center", "clinic", "site", "provider",
-                     "settlement", "community", "location"],
-        "community": ["community", "settlement", "reserve", "town", "village", "city", "region", "area"],
-        "zone": ["zone", "region", "district", "area", "healthzone"],
-        "specialty": ["specialty", "programme", "program", "service", "discipline", "department"],
-        "period": ["period", "quarter", "year", "month", "time", "fiscal", "date"],
-        "city": ["city", "town", "village"],
-        "lat": ["latitude", "lat"],
-        "lon": ["longitude", "lon", "lng"],
-        "population": ["population", "members", "residents", "census"],
-        "prevalence": ["prevalence", "rate", "risk", "pct", "percentage"],
-        "volume": ["count", "visits", "clients", "volume", "n", "cases"],
-        "cost": ["cost", "expense", "spend", "budget", "perclient", "startup"],
-        "capacity": ["capacity", "throughput", "slots", "dailycapacity", "clientsperday"],
-    }
-    def _canon(s: str) -> str:
-        return re.sub(r"[^a-z0-9]+", "", (s or "").lower())
-    def resolve_one(want: str, columns: Iterable[str]) -> Optional[str]:
-        cols = list(columns or [])
-        if not cols:
-            return None
-        w = _canon(want or "")
-        if not w:
-            return None
-        canon_cols = { _canon(c): c for c in cols }
-        if w in canon_cols:
-            return canon_cols[w]
-        syns = _ROLE_SYNONYMS.get(want.lower(), [])
-        syns_canon = [_canon(s) for s in syns]
-        # Try synonyms exact/startswith/contains
-        best, score = None, -1
-        for c in cols:
-            cc = _canon(c)
-            sc = 0
-            if w and (cc == w or cc.startswith(w) or w in cc): sc += 3
-            for s in syns_canon:
-                if cc == s: sc += 5
-                elif cc.startswith(s): sc += 3
-                elif s in cc: sc += 2
-            if sc > score:
-                best, score = c, sc
-        return best if score >= 2 else None
-    def resolve_cols(requested: Iterable[str], columns: Iterable[str]) -> List[str]:
-        out, seen = [], set()
-        for r in requested or []:
-            col = resolve_one(r, columns)
-            if col and col not in seen:
-                out.append(col); seen.add(col)
-        return out
-# ---------- Safe expression evaluation (filters/derivations) ----------
 _ALLOWED_FUNCS = {
     "abs": abs, "round": round,
     "sqrt": np.sqrt, "log": np.log, "exp": np.exp,
@@ -114,9 +153,9 @@ def _eval_series_expr(expr: str, df: pd.DataFrame) -> pd.Series:
         return pd.Series(val, index=df.index)
     if isinstance(val, (bool, np.bool_)):
         return pd.Series([val] * len(df), index=df.index)
-    raise ValueError("Filter/derive expression must yield a vector or boolean")
-# ---------- Helpers ----------
 def _as_df(v: Any) -> Optional[pd.DataFrame]:
     if isinstance(v, pd.DataFrame):
         return v
@@ -136,13 +175,10 @@ def _get_df(datasets: Dict[str, Any], key: Optional[str]) -> Optional[pd.DataFra
 def _auto_group_cols(df: pd.DataFrame) -> List[str]:
     prefs = ["facility","community","settlement","provider","zone","region","district","specialty","program","service","city"]
-    resolved = []
     for p in prefs:
-        col = resolve_one(p, df.columns)
-        if col and col not in resolved:
-            resolved.append(col)
-    if resolved:
-        return [resolved[0]]
     obj_cols = [c for c in df.columns if df[c].dtype == "object"]
     return obj_cols[:1] if obj_cols else []
@@ -219,7 +255,6 @@ def _small_n_flags(df: pd.DataFrame, count_col: Optional[str] = None, threshold:
         return None
     if count_col and count_col in df.columns:
         return df[count_col].apply(lambda n: " (interpret cautiously: small n)" if pd.notnull(n) and float(n) < threshold else "")
-    # Fallback if no explicit count column—don’t guess
     return None
 def _missingness(df: pd.DataFrame, metric_cols: List[str]) -> List[str]:
@@ -231,7 +266,7 @@ def _missingness(df: pd.DataFrame, metric_cols: List[str]) -> List[str]:
                 notes.append(f"{c}: missing {miss:.1%}")
     return notes
-# ---------- Scenario Engine ----------
 class ScenarioEngine:
     """
     Execute a ScenarioPlan (or dict) consisting of tasks that specify:
@@ -256,10 +291,9 @@ class ScenarioEngine:
                    mapping_log: List[str]) -> pd.DataFrame:
         # Resolve grouping to existing columns; tolerate roles or wrong names
         if group_by:
-            gcols = resolve_cols(group_by, df.columns)
-            # log role->actual for transparency
             for want in (group_by or []):
-                got = resolve_one(want, df.columns)
                 mapping_log.append(f"group_by: {want} → {got if got else '(unresolved)'}")
         else:
             gcols = _auto_group_cols(df)
@@ -268,13 +302,12 @@ class ScenarioEngine:
             else:
                 mapping_log.append("group_by: (auto) → (none)")
-        # If no grouping and no aggregations → return df as-is (trim wide frames)
         aggs = _parse_aggs(agg_spec or "")
         if not gcols:
             if not aggs:
-                # Keep a reasonable view: first 50 rows
                 return df.head(50).copy()
-            # global aggregate row
             rec = { out_col: _apply_agg_call(df, call) for out_col, call in aggs }
             return pd.DataFrame([rec])
@@ -306,9 +339,9 @@ class ScenarioEngine:
                         mapping_log: List[str]) -> pd.DataFrame:
         if not isinstance(out_df, pd.DataFrame) or out_df.empty or not fields:
             return out_df
-        cols = resolve_cols(fields, out_df.columns)
         for want in fields:
-            got = resolve_one(want, out_df.columns)
             mapping_log.append(f"field: {want} → {got if got else '(unresolved)'}")
         if cols:
             return out_df[cols]
@@ -336,7 +369,6 @@ class ScenarioEngine:
     @staticmethod
     def _exec_task(t: Any, datasets: Dict[str, Any]) -> str:
-        # tolerate dict-like tasks or dataclass
         title = getattr(t, "title", None) or (isinstance(t, dict) and t.get("title")) or "Task"
         section_lines: List[str] = [f"## {title}\n"]
@@ -346,7 +378,7 @@ class ScenarioEngine:
             section_lines.append("_No matching data for this task._")
             return "\n".join(section_lines)
-        # Optional filter(s)
         t_filter = getattr(t, "filter", None) or (isinstance(t, dict) and t.get("filter"))
         if t_filter:
             try:
@@ -354,7 +386,7 @@ class ScenarioEngine:
             except Exception as e:
                 section_lines.append(f"_Warning: filter ignored ({e})._")
-        # Optional derive(s)
         t_derive = getattr(t, "derive", None) or (isinstance(t, dict) and t.get("derive"))
         if t_derive:
             for d in (t_derive if isinstance(t_derive, (list, tuple)) else [t_derive]):
@@ -363,16 +395,12 @@ class ScenarioEngine:
                 except Exception as e:
                     section_lines.append(f"_Warning: derive ignored ({e})._")
-        # Group/Aggregate
         t_group_by = getattr(t, "group_by", None) or (isinstance(t, dict) and t.get("group_by"))
-        # allow single string in plans
         if isinstance(t_group_by, str):
             t_group_by = [t_group_by]
         t_agg = getattr(t, "agg", None) or (isinstance(t, dict) and t.get("agg"))
-        if isinstance(t_agg, list):
-            agg_spec = ", ".join(t_agg)
-        else:
-            agg_spec = (t_agg or None)
         mapping_log: List[str] = []
         out_df = ScenarioEngine._group_agg(df, t_group_by, agg_spec, mapping_log)
@@ -380,11 +408,11 @@ class ScenarioEngine:
         # Sort / Top
         t_sort_by = getattr(t, "sort_by", None) or (isinstance(t, dict) and t.get("sort_by"))
         t_sort_dir = (getattr(t, "sort_dir", None) or (isinstance(t, dict) and t.get("sort_dir")) or "desc").lower()
-        if t_sort_by and isinstance(out_df, pd.DataFrame) and t_sort_by in out_df.columns:
             out_df = out_df.sort_values(t_sort_by, ascending=(t_sort_dir=="asc"))
         t_top = getattr(t, "top", None) or (isinstance(t, dict) and t.get("top"))
-        if isinstance(t_top, int) and t_top > 0 and isinstance(out_df, pd.DataFrame):
             out_df = out_df.head(t_top)
         # Field projection
@@ -393,7 +421,7 @@ class ScenarioEngine:
             t_fields = [t_fields]
         out_df = ScenarioEngine._project_fields(out_df, t_fields, mapping_log)
-        # Render table
         section_lines.append(_render_table(out_df))
         # Assumptions & Mappings

 # scenario_engine.py
 from __future__ import annotations
 from typing import Dict, List, Any, Tuple, Optional, Iterable
 import re, math, ast
 import numpy as np
 import pandas as pd
+# ========= Robust role/column resolver (safe with pandas.Index) =========
 try:
+    # If you have an external, richer resolver, we will use it automatically.
+    from column_resolver import resolve_one as _ext_resolve_one, resolve_cols as _ext_resolve_cols  # type: ignore
+    _HAS_EXT_RESOLVER = True
 except Exception:
+    _HAS_EXT_RESOLVER = False
# Synonym table for the built-in column matcher in this module.
# Keys are semantic role names; each value lists header fragments (all in
# lower-case alphanumeric form) that indicate a column plays that role.
_ROLE_SYNONYMS_FALLBACK = {
    "facility": [
        "facility", "hospital", "centre", "center", "clinic",
        "site", "provider", "settlement", "community", "location",
    ],
    "community": [
        "community", "settlement", "reserve", "town",
        "village", "city", "region", "area",
    ],
    "zone": [
        "zone", "region", "district", "area", "healthzone",
    ],
    "specialty": [
        "specialty", "programme", "program",
        "service", "discipline", "department",
    ],
    "period": [
        "period", "quarter", "year", "month", "time", "fiscal", "date",
    ],
    "city": [
        "city", "town", "village",
    ],
    "lat": [
        "latitude", "lat",
    ],
    "lon": [
        "longitude", "lon", "lng",
    ],
    "population": [
        "population", "members", "residents", "census",
    ],
    "prevalence": [
        "prevalence", "rate", "risk", "pct", "percentage",
    ],
    "volume": [
        "count", "visits", "clients", "volume", "n", "cases",
    ],
    "cost": [
        "cost", "expense", "spend", "budget", "perclient", "startup",
    ],
    "capacity": [
        "capacity", "throughput", "slots", "dailycapacity", "clientsperday",
    ],
}
# Anything that is not a lower-case ASCII letter or digit is dropped.
_NON_ALNUM_RE = re.compile(r"[^a-z0-9]+")


def _canon(s: str) -> str:
    """Return *s* lower-cased with every non ``[a-z0-9]`` character removed.

    Used to compare headers and role names irrespective of case, spacing
    and punctuation ("Health Zone" and "health_zone" both become
    "healthzone").

    Falsy input (``None``, ``""``, ``0``) yields ``""``.  Any other
    non-string value is converted with ``str()`` first, so a numeric name
    coming from a plan does not raise ``AttributeError``.
    """
    if not s:
        return ""
    text = s if isinstance(s, str) else str(s)
    return _NON_ALNUM_RE.sub("", text.lower())
def _to_list(x: Optional[Iterable[Any]]) -> List[Any]:
    """Materialise *x* as a plain list.

    * ``None`` becomes ``[]``.
    * A ``str`` or ``bytes`` value is treated as a single item and wrapped,
      not split into characters (``"facility"`` -> ``["facility"]``).
    * Any other iterable (list, tuple, generator, ``pandas.Index`` ...) is
      copied with ``list()``.
    * Anything that cannot be iterated is wrapped in a one-element list.
    """
    if x is None:
        return []
    if isinstance(x, (str, bytes)):
        # list("facility") would yield single characters, which is never
        # what a caller passing one column/role name intends.
        return [x]
    try:
        return list(x)
    except Exception:
        # Deliberate best-effort: treat a non-iterable value as one item.
        return [x]
def resolve_one(want: str, columns: Iterable[str]) -> Optional[str]:
    """Return the header in *columns* that best matches *want*, or ``None``.

    *want* may be an exact header or a semantic role name (a key of
    ``_ROLE_SYNONYMS_FALLBACK``).  *columns* may be any iterable, including
    a ``pandas.Index``; it is materialised into a list first.

    Resolution order:
      1. If the external resolver was imported, delegate to it; if it
         raises, fall through to the built-in matcher.
      2. Exact match after canonicalisation (case/punctuation-insensitive).
      3. Scored fuzzy match: +3 when the canonical *want* occurs inside the
         canonical header, and for each role synonym +5 on an exact match,
         +3 on a prefix match, +2 on a substring match.  The first header
         with the highest score wins, provided it scores at least 2.

    Non-string headers are ignored by the built-in matcher.  A non-string
    *want* is converted with ``str()``.  One-character needles (such as the
    synonym "n") only ever match exactly, because as a substring they occur
    in almost every header.
    """
    cols = _to_list(columns)
    if _HAS_EXT_RESOLVER:
        try:
            return _ext_resolve_one(want, cols)
        except Exception:
            # Deliberate best-effort: any failure in the optional external
            # resolver falls back to the built-in matcher below.
            pass

    if not cols or not want:
        return None
    if not isinstance(want, str):
        # Plans may carry non-string names; the string handling below
        # would otherwise raise AttributeError.
        want = str(want)
    wcanon = _canon(want)
    if not wcanon:
        return None

    canon_cols = {_canon(c): c for c in cols if isinstance(c, str)}
    if wcanon in canon_cols:
        return canon_cols[wcanon]

    # Role keys are already canonical, so looking up by the canonical form
    # also accepts variants such as "Facility " or "FACILITY".
    syns_canon = [_canon(s) for s in _ROLE_SYNONYMS_FALLBACK.get(wcanon, [])]

    # Needles shorter than this are too unspecific for prefix/substring use.
    min_partial_len = 2
    want_partial_ok = len(wcanon) >= min_partial_len

    best, score = None, -1
    for c in cols:
        if not isinstance(c, str):
            continue
        cc = _canon(c)
        sc = 0
        # Exact equality was handled above, so containment covers both the
        # "starts with" and the "contains" case.
        if want_partial_ok and wcanon in cc:
            sc += 3
        for s in syns_canon:
            if not s:
                continue
            if cc == s:
                sc += 5
            elif len(s) >= min_partial_len:
                if cc.startswith(s):
                    sc += 3
                elif s in cc:
                    sc += 2
        # Strict ">" keeps the earliest header among equally scored ones.
        if sc > score:
            best, score = c, sc
    return best if score >= 2 else None
def resolve_cols(requested: Iterable[str], columns: Iterable[str]) -> List[str]:
    """Map each requested role/header to an existing column.

    Both arguments are materialised into lists first, so a ``pandas.Index``
    is accepted.  When the external resolver was imported it is tried first;
    if it raises, every request is resolved individually with
    ``resolve_one``.  Unresolved requests are dropped and each column appears
    once, in the order it was first resolved.
    """
    wanted = _to_list(requested)
    available = _to_list(columns)

    if _HAS_EXT_RESOLVER:
        try:
            return _ext_resolve_cols(wanted, available)
        except Exception:
            pass

    # dict.fromkeys keeps first-seen order while discarding repeats.
    hits = (resolve_one(name, available) for name in wanted)
    return list(dict.fromkeys(hit for hit in hits if hit))
+# ========= Safe expression evaluation (filters/derivations) =========
 _ALLOWED_FUNCS = {
     "abs": abs, "round": round,
     "sqrt": np.sqrt, "log": np.log, "exp": np.exp,
         return pd.Series(val, index=df.index)
     if isinstance(val, (bool, np.bool_)):
         return pd.Series([val] * len(df), index=df.index)
+    raise ValueError("Expression must yield a vector or boolean")
+# ========= Helpers =========
 def _as_df(v: Any) -> Optional[pd.DataFrame]:
     if isinstance(v, pd.DataFrame):
         return v
 def _auto_group_cols(df: pd.DataFrame) -> List[str]:
     prefs = ["facility","community","settlement","provider","zone","region","district","specialty","program","service","city"]
     for p in prefs:
+        col = resolve_one(p, _to_list(df.columns))
+        if col:
+            return [col]
     obj_cols = [c for c in df.columns if df[c].dtype == "object"]
     return obj_cols[:1] if obj_cols else []
         return None
     if count_col and count_col in df.columns:
         return df[count_col].apply(lambda n: " (interpret cautiously: small n)" if pd.notnull(n) and float(n) < threshold else "")
     return None
 def _missingness(df: pd.DataFrame, metric_cols: List[str]) -> List[str]:
                 notes.append(f"{c}: missing {miss:.1%}")
     return notes
+# ========= Scenario Engine =========
 class ScenarioEngine:
     """
     Execute a ScenarioPlan (or dict) consisting of tasks that specify:
                    mapping_log: List[str]) -> pd.DataFrame:
         # Resolve grouping to existing columns; tolerate roles or wrong names
         if group_by:
+            gcols = resolve_cols(group_by, _to_list(df.columns))
             for want in (group_by or []):
+                got = resolve_one(want, _to_list(df.columns))
                 mapping_log.append(f"group_by: {want} → {got if got else '(unresolved)'}")
         else:
             gcols = _auto_group_cols(df)
             else:
                 mapping_log.append("group_by: (auto) → (none)")
         aggs = _parse_aggs(agg_spec or "")
+        # No grouping & no agg => just preview a slice
         if not gcols:
             if not aggs:
                 return df.head(50).copy()
             rec = { out_col: _apply_agg_call(df, call) for out_col, call in aggs }
             return pd.DataFrame([rec])
                         mapping_log: List[str]) -> pd.DataFrame:
         if not isinstance(out_df, pd.DataFrame) or out_df.empty or not fields:
             return out_df
+        cols = resolve_cols(fields, _to_list(out_df.columns))
         for want in fields:
+            got = resolve_one(want, _to_list(out_df.columns))
             mapping_log.append(f"field: {want} → {got if got else '(unresolved)'}")
         if cols:
             return out_df[cols]
     @staticmethod
     def _exec_task(t: Any, datasets: Dict[str, Any]) -> str:
         title = getattr(t, "title", None) or (isinstance(t, dict) and t.get("title")) or "Task"
         section_lines: List[str] = [f"## {title}\n"]
             section_lines.append("_No matching data for this task._")
             return "\n".join(section_lines)
+        # Filter(s)
         t_filter = getattr(t, "filter", None) or (isinstance(t, dict) and t.get("filter"))
         if t_filter:
             try:
             except Exception as e:
                 section_lines.append(f"_Warning: filter ignored ({e})._")
+        # Derive(s)
         t_derive = getattr(t, "derive", None) or (isinstance(t, dict) and t.get("derive"))
         if t_derive:
             for d in (t_derive if isinstance(t_derive, (list, tuple)) else [t_derive]):
                 except Exception as e:
                     section_lines.append(f"_Warning: derive ignored ({e})._")
+        # Group/Agg
         t_group_by = getattr(t, "group_by", None) or (isinstance(t, dict) and t.get("group_by"))
         if isinstance(t_group_by, str):
             t_group_by = [t_group_by]
         t_agg = getattr(t, "agg", None) or (isinstance(t, dict) and t.get("agg"))
+        agg_spec = ", ".join(t_agg) if isinstance(t_agg, list) else (t_agg or None)
         mapping_log: List[str] = []
         out_df = ScenarioEngine._group_agg(df, t_group_by, agg_spec, mapping_log)
         # Sort / Top
         t_sort_by = getattr(t, "sort_by", None) or (isinstance(t, dict) and t.get("sort_by"))
         t_sort_dir = (getattr(t, "sort_dir", None) or (isinstance(t, dict) and t.get("sort_dir")) or "desc").lower()
+        if isinstance(out_df, pd.DataFrame) and t_sort_by and t_sort_by in out_df.columns:
             out_df = out_df.sort_values(t_sort_by, ascending=(t_sort_dir=="asc"))
         t_top = getattr(t, "top", None) or (isinstance(t, dict) and t.get("top"))
+        if isinstance(out_df, pd.DataFrame) and isinstance(t_top, int) and t_top > 0:
             out_df = out_df.head(t_top)
         # Field projection
             t_fields = [t_fields]
         out_df = ScenarioEngine._project_fields(out_df, t_fields, mapping_log)
+        # Render
         section_lines.append(_render_table(out_df))
         # Assumptions & Mappings