Rajan Sharma committed on
Commit
49f10c8
·
verified ·
1 Parent(s): aff5a07

Create schema_mapper.py

Browse files
Files changed (1) hide show
  1. schema_mapper.py +107 -0
schema_mapper.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import re
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List, Any, Tuple, Optional
5
+ import pandas as pd
6
+ from data_registry import DataRegistry
7
+
8
# Regex hints used to recognise each concept in a column name.
#
# Order matters within each list: _score_col returns ``100 - index`` of the
# first matching hint, so earlier (more specific) hints rank higher.
# All hints are written in lowercase because _score_col lowercases the column
# name before matching; an uppercase literal could never match.
CONCEPT_HINTS = {
    "facility": [r"\bfacilit(y|ies)\b", r"\bhospital\b", r"\bsite\b", r"\bcentre\b", r"\bcenter\b"],
    "specialty": [r"\bspecialt(y|ies)\b", r"\bservice\b", r"\bdepartment\b"],
    "zone": [r"\bzone\b", r"\bregion\b", r"\bhealth zone\b"],
    "wait_median": [r"\bmedian\b.*\bwait", r"\bp50\b.*\bwait", r"\bwait.*\bmedian"],
    "wait_p90": [r"\bp90\b.*\bwait", r"\b90(th)? percentile\b.*\bwait", r"\bwait.*p90"],
    "wait_days": [r"\bwait\b.*\bdays?\b", r"\bdays?\b.*\bwait\b"],
    "capacity_beds": [r"\bstaffed\b.*\bbeds?\b", r"\bbeds?\b"],
    "cost_fixed": [r"\bfixed\b.*\bcost", r"\bstartup\b.*\bcost"],
    # The separator between "per" and the unit is optional and may be a
    # hyphen, underscore or whitespace (a bare \b there can never be followed
    # directly by a word character, so it would make the hint unmatchable).
    "cost_variable": [r"\bvariable\b.*\bcost", r"\bcost.*per[-_\s]?(client|case|visit)\b"],
    "clients_per_day": [r"\bclients?\b.*\bday\b", r"\bper[-_\s]?day\b.*clients?"],
    "teams": [r"\bteams?\b", r"\bscreen(ing)? team\b"],
}
21
+
22
+ def _score_col(col_name: str, patterns: List[str]) -> int:
23
+ c = col_name.lower()
24
+ for i, pat in enumerate(patterns):
25
+ if re.search(pat, c):
26
+ return 100 - i
27
+ return 0
28
+
29
@dataclass
class MappingResult:
    """Outcome of matching known concepts against table columns.

    Every field starts out empty and each instance gets its own containers.

    Attributes:
        resolved: Concept name -> the single ``(table, column)`` pair chosen.
        ambiguous: Concept name -> list of competing ``(table, column)`` pairs.
        missing: Concept names for which no column was found.
    """

    # concept -> (table name, column name)
    resolved: Dict[str, Tuple[str, str]] = field(default_factory=dict)
    # concept -> [(table name, column name), ...]
    ambiguous: Dict[str, List[Tuple[str, str]]] = field(default_factory=dict)
    # concept names with no match
    missing: List[str] = field(default_factory=list)
34
+
35
def map_concepts(scenario_text: str, registry: DataRegistry) -> MappingResult:
    """Match every concept in CONCEPT_HINTS to columns found in the registry.

    Args:
        scenario_text: Accepted for interface compatibility; not read here.
        registry: Source of tables. Must provide ``names()`` and
            ``iter_tables()``; each table exposes ``name`` and ``df.columns``.

    Returns:
        A MappingResult in which each concept lands in exactly one of
        ``resolved`` (one clear winner), ``ambiguous`` (several close
        candidates) or ``missing`` (no column scored above zero).
    """
    outcome = MappingResult()

    # No tables registered at all: nothing can be matched.
    if not registry.names():
        outcome.missing = list(CONCEPT_HINTS.keys())
        return outcome

    # Flatten the registry into (table name, column name) pairs.
    columns = [
        (table.name, str(column))
        for table in registry.iter_tables()
        for column in table.df.columns
    ]

    for concept, patterns in CONCEPT_HINTS.items():
        # Highest score first; the sort is stable, so equal scores keep the
        # registry's table/column order.
        ranked = sorted(
            (((tbl, col), _score_col(col, patterns)) for tbl, col in columns),
            key=lambda entry: -entry[1],
        )
        best = ranked[0][1] if ranked else 0
        if best == 0:
            outcome.missing.append(concept)
            continue

        # Candidates are columns scoring within 5 points of the best,
        # and never below 50.
        cutoff = max(50, best - 5)
        candidates = [pair for pair, score in ranked if score >= cutoff]
        if len(candidates) == 1:
            outcome.resolved[concept] = candidates[0]
        else:
            outcome.ambiguous[concept] = candidates
    return outcome
60
+
61
def build_phase1_questions(scenario_text: str, registry: DataRegistry, mapping: MappingResult, max_groups: int = 5) -> str:
    """Render Markdown clarification questions for unresolved concepts.

    Args:
        scenario_text: Accepted for interface compatibility; not read here.
        registry: Accepted for interface compatibility; not read here.
        mapping: Result of concept mapping; its ``resolved``, ``ambiguous``
            and ``missing`` members decide which questions are asked.
        max_groups: Upper bound on the number of question entries kept.
            NOTE(review): despite the name, the cap is applied to individual
            questions, not to section headings — confirm this is intended.

    Returns:
        A Markdown string: a title line, then a bold heading each time the
        section changes, with each question as a bullet under it.
    """
    questions: List[Tuple[str, str]] = []

    def _disambiguate_or_request(concept: str, section: str, lead: str, fallback: str) -> None:
        # Ambiguous concepts list up to six candidate columns; missing ones
        # get a plain request instead.
        if concept in mapping.ambiguous:
            options = "; ".join(f"{tbl}.{col}" for tbl, col in mapping.ambiguous[concept][:6])
            questions.append((section, f"{lead} **Which column matches** `{concept}`? Options: {options}"))
        elif concept in mapping.missing:
            questions.append((section, fallback))

    def _request_if_missing(concept: str, section: str, prompt: str) -> None:
        if concept in mapping.missing:
            questions.append((section, prompt))

    _disambiguate_or_request(
        "facility",
        "Prioritization",
        "We need the facility identifier to aggregate by site.",
        "Provide the **facility/site** column to group results (name or ID).",
    )
    _disambiguate_or_request(
        "specialty",
        "Prioritization",
        "We need the specialty/service field to rank by specialty.",
        "Provide the **specialty/service** column (e.g., General Surgery, Ortho).",
    )
    _disambiguate_or_request(
        "capacity_beds",
        "Capacity",
        "To estimate staffed capacity, confirm the **staffed beds** column.",
        "Provide **staffed beds** (or equivalent capacity) column.",
    )

    _request_if_missing("clients_per_day", "Capacity", "What is the **clients per day** rate per team?")
    _request_if_missing("teams", "Capacity", "How many **teams** operate concurrently?")

    _request_if_missing("cost_fixed", "Cost", "Provide **fixed/startup cost** (or confirm none).")
    _request_if_missing("cost_variable", "Cost", "Provide **variable cost per client/case** (or unit definition).")

    # Ask about wait times only when none of the wait concepts was resolved.
    wait_concepts = ("wait_median", "wait_p90", "wait_days")
    if not any(name in mapping.resolved for name in wait_concepts):
        questions.append(("Clinical", "Which columns capture **wait times** (median/p90 or days)?"))

    # This open-ended question is always queued (it may still be cut below).
    questions.append(("Recommendations", "Any operational constraints or equity priorities we should encode (scheduling limits, rural access, partnerships)?"))

    lines = ["**Clarification Questions**"]
    previous_section = None
    for section, text in questions[:max_groups]:
        # Emit a heading whenever the section changes.
        if section != previous_section:
            lines.append(f"\n**{section}:**")
            previous_section = section
        lines.append(f"- {text}")
    return "\n".join(lines)