Spaces:
Sleeping
Sleeping
Rajan Sharma
committed on
Create schema_mapper.py
Browse files- schema_mapper.py +107 -0
schema_mapper.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import re
|
| 3 |
+
from dataclasses import dataclass, field
|
| 4 |
+
from typing import Dict, List, Any, Tuple, Optional
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from data_registry import DataRegistry
|
| 7 |
+
|
| 8 |
+
# Regex hints used to recognise each concept in a column header.
#
# Within a concept, hints are listed from highest to lowest priority: the
# scorer (_score_col) returns ``100 - index`` for the first hint that matches.
# The scorer lower-cases the column name before searching, so every hint must
# be written in lower case or it can never match.
CONCEPT_HINTS = {
    "facility": [r"\bfacilit(y|ies)\b", r"\bhospital\b", r"\bsite\b", r"\bcentre\b", r"\bcenter\b"],
    "specialty": [r"\bspecialt(y|ies)\b", r"\bservice\b", r"\bdepartment\b"],
    "zone": [r"\bzone\b", r"\bregion\b", r"\bhealth zone\b"],
    # "p50" is lower-case on purpose: the name being searched is lower-cased.
    "wait_median": [r"\bmedian\b.*\bwait", r"\bp50\b.*\bwait", r"\bwait.*\bmedian"],
    "wait_p90": [r"\bp90\b.*\bwait", r"\b90(th)? percentile\b.*\bwait", r"\bwait.*p90"],
    "wait_days": [r"\bwait\b.*\bdays?\b", r"\bdays?\b.*\bwait\b"],
    "capacity_beds": [r"\bstaffed\b.*\bbeds?\b", r"\bbeds?\b"],
    "cost_fixed": [r"\bfixed\b.*\bcost", r"\bstartup\b.*\bcost"],
    # An optional separator is allowed between "per" and the unit, mirroring
    # the "per[-_\s]?day" hint below; a bare "\b" there could never match.
    "cost_variable": [r"\bvariable\b.*\bcost", r"\bcost.*per[-_\s]?(client|case|visit)\b"],
    "clients_per_day": [r"\bclients?\b.*\bday\b", r"\bper[-_\s]?day\b.*clients?"],
    "teams": [r"\bteams?\b", r"\bscreen(ing)? team\b"],
}
|
| 21 |
+
|
| 22 |
+
def _score_col(col_name: str, patterns: List[str]) -> int:
|
| 23 |
+
c = col_name.lower()
|
| 24 |
+
for i, pat in enumerate(patterns):
|
| 25 |
+
if re.search(pat, c):
|
| 26 |
+
return 100 - i
|
| 27 |
+
return 0
|
| 28 |
+
|
| 29 |
+
@dataclass
class MappingResult:
    """Outcome of matching concepts against the columns of registered tables.

    Every field defaults to a fresh empty container, so instances never share
    state with one another.
    """

    # concept -> the single (table name, column name) pair chosen for it
    resolved: Dict[str, Tuple[str, str]] = field(default_factory=dict)
    # concept -> the competing (table name, column name) candidates
    ambiguous: Dict[str, List[Tuple[str, str]]] = field(default_factory=dict)
    # concepts for which no column matched
    missing: List[str] = field(default_factory=list)
|
| 34 |
+
|
| 35 |
+
def map_concepts(scenario_text: str, registry: DataRegistry) -> MappingResult:
    """Match every concept in ``CONCEPT_HINTS`` to columns of the registry's tables.

    Each (table, column) pair is scored against the concept's hints with
    ``_score_col``. A concept whose best score is zero is reported as missing.
    Otherwise the candidates scoring at least ``max(50, best - 5)`` are kept:
    exactly one makes the concept resolved, any other count makes it ambiguous.

    Args:
        scenario_text: Accepted for interface compatibility; not read here.
        registry: Source of tables. This function uses ``names()`` and
            ``iter_tables()``, and each table's ``name`` and ``df.columns``.

    Returns:
        A ``MappingResult``. When ``registry.names()`` is empty, every
        concept is listed as missing.
    """
    outcome = MappingResult()
    if not registry.names():
        outcome.missing = list(CONCEPT_HINTS.keys())
        return outcome

    # Flatten every table into (table name, column name) candidates.
    candidates = [
        (table.name, str(column))
        for table in registry.iter_tables()
        for column in table.df.columns
    ]

    for concept, patterns in CONCEPT_HINTS.items():
        # sorted() is stable, so equally scored candidates keep table order.
        ranked = sorted(
            ((pair, _score_col(pair[1], patterns)) for pair in candidates),
            key=lambda entry: entry[1],
            reverse=True,
        )
        best = ranked[0][1] if ranked else 0
        if best == 0:
            outcome.missing.append(concept)
            continue

        cutoff = max(50, best - 5)
        shortlist = [pair for pair, score in ranked if score >= cutoff]
        if len(shortlist) == 1:
            outcome.resolved[concept] = shortlist[0]
        else:
            outcome.ambiguous[concept] = shortlist
    return outcome
|
| 60 |
+
|
| 61 |
+
def build_phase1_questions(scenario_text: str, registry: DataRegistry, mapping: MappingResult, max_groups: int = 5) -> str:
    """Render clarification questions, as Markdown, for unresolved concepts.

    Questions are collected in a fixed order (Prioritization, Capacity, Cost,
    Clinical, Recommendations), truncated to the first ``max_groups`` entries,
    and printed under a bold heading that is repeated only when the section
    changes.

    Args:
        scenario_text: Accepted for interface compatibility; not read here.
        registry: Accepted for interface compatibility; not read here.
        mapping: Object whose ``resolved``, ``ambiguous`` and ``missing``
            members are consulted to decide which questions to ask.
        max_groups: Maximum number of individual questions kept (the slice is
            applied to questions, not to section headings).

    Returns:
        The Markdown text, starting with ``**Clarification Questions**``.
    """
    questions: List[Tuple[str, str]] = []

    def _queue(section: str, text: str) -> None:
        questions.append((section, text))

    # Concepts that get a disambiguation question when several columns
    # compete, or a plain request when no column matched:
    # (concept, section, disambiguation lead-in, prompt when missing)
    column_prompts = (
        (
            "facility",
            "Prioritization",
            "We need the facility identifier to aggregate by site.",
            "Provide the **facility/site** column to group results (name or ID).",
        ),
        (
            "specialty",
            "Prioritization",
            "We need the specialty/service field to rank by specialty.",
            "Provide the **specialty/service** column (e.g., General Surgery, Ortho).",
        ),
        (
            "capacity_beds",
            "Capacity",
            "To estimate staffed capacity, confirm the **staffed beds** column.",
            "Provide **staffed beds** (or equivalent capacity) column.",
        ),
    )
    for concept, section, lead, fallback in column_prompts:
        if concept in mapping.ambiguous:
            # At most six candidate columns are offered per question.
            choices = "; ".join(
                f"{tbl}.{col}" for tbl, col in mapping.ambiguous[concept][:6]
            )
            _queue(section, f"{lead} **Which column matches** `{concept}`? Options: {choices}")
        elif concept in mapping.missing:
            _queue(section, fallback)

    # Concepts that are only asked about when no column matched at all.
    missing_only_prompts = (
        ("clients_per_day", "Capacity", "What is the **clients per day** rate per team?"),
        ("teams", "Capacity", "How many **teams** operate concurrently?"),
        ("cost_fixed", "Cost", "Provide **fixed/startup cost** (or confirm none)."),
        ("cost_variable", "Cost", "Provide **variable cost per client/case** (or unit definition)."),
    )
    for concept, section, text in missing_only_prompts:
        if concept in mapping.missing:
            _queue(section, text)

    # One resolved wait-time concept is enough to skip the clinical question.
    wait_concepts = ("wait_median", "wait_p90", "wait_days")
    if not any(name in mapping.resolved for name in wait_concepts):
        _queue("Clinical", "Which columns capture **wait times** (median/p90 or days)?")

    _queue(
        "Recommendations",
        "Any operational constraints or equity priorities we should encode (scheduling limits, rural access, partnerships)?",
    )

    lines = ["**Clarification Questions**"]
    previous_section = None
    for section, text in questions[:max_groups]:
        if section != previous_section:
            lines.append(f"\n**{section}:**")
            previous_section = section
        lines.append(f"- {text}")
    return "\n".join(lines)
|