essprasad committed on
Commit
eb85c59
·
verified ·
1 Parent(s): ef81201

Create variable_loader.py

Browse files
Files changed (1) hide show
  1. core/variable_loader.py +236 -0
core/variable_loader.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # core/variable_loader.py
2
+ import os
3
+ import glob
4
+ import json
5
+ import time
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
# pandas is an optional dependency: if it is missing we record that fact
# and let load_variable_metadata() skip gracefully instead of crashing
# at import time.
try:
    import pandas as pd
except Exception:
    pd = None

# cache path (temporary): extracted metadata is persisted here between runs
CACHE_PATH = "/tmp/ct_var_cache.json"
CACHE_TTL_SECONDS = 60 * 60  # 1 hour; adjust as needed

# candidate filenames / patterns to detect relevant excel files
# (SDTM / ADaM / CDASH implementation-guide style spreadsheets)
DEFAULT_PATTERNS = [
    "*SDTM*.xls*", "*SDTMIG*.xls*", "*SDTM_*.xls*", "SDTM*.xls*",
    "*ADaM*.xls*", "*ADaMIG*.xls*", "ADaM*.xls*",
    "*CDASH*.xls*", "*CDASHIG*.xls*", "CDASH*.xls*"
]

# Typical column name candidates, matched case-insensitively against
# spreadsheet headers by _first_existing(); order is priority order.
VAR_COL_CANDIDATES = [
    "variable", "variable name", "varname", "var", "column", "fieldname"
]
LABEL_COL_CANDIDATES = [
    "label", "variable label", "var label", "column label"
]
DESC_COL_CANDIDATES = [
    "description", "definition", "long name", "comments", "notes"
]
ROLE_COL_CANDIDATES = [
    "role", "type", "datatype", "origin"
]
40
+
41
+ def _first_existing(columns, candidates):
42
+ if not columns:
43
+ return None
44
+ low = {c.strip().lower(): c for c in columns}
45
+ for cand in candidates:
46
+ for k, orig in low.items():
47
+ if cand == k or cand in k:
48
+ return orig
49
+ return None
50
+
51
+
52
+ def _discover_files(search_paths=None, patterns=None):
53
+ patterns = patterns or DEFAULT_PATTERNS
54
+ search_paths = search_paths or [
55
+ ".", "/workspace/data", "/mnt/data", os.getcwd(),
56
+ "/root/.cache/huggingface/hub", "/home/user/.cache/huggingface/hub",
57
+ "/root/.cache/huggingface/hub/datasets--essprasad--CT-Chat-Docs",
58
+ "/home/user/.cache/huggingface/hub/datasets--essprasad--CT-Chat-Docs",
59
+ ]
60
+ found = []
61
+ for base in search_paths:
62
+ if not base or not os.path.exists(base):
63
+ continue
64
+ for pat in patterns:
65
+ try:
66
+ matches = glob.glob(os.path.join(base, pat), recursive=True)
67
+ for m in matches:
68
+ if os.path.isfile(m) and m.lower().endswith((".xls", ".xlsx")):
69
+ found.append(os.path.abspath(m))
70
+ except Exception:
71
+ continue
72
+ # dedupe but keep order
73
+ seen = set()
74
+ unique = []
75
+ for p in found:
76
+ if p not in seen:
77
+ seen.add(p)
78
+ unique.append(p)
79
+ return unique
80
+
81
+
82
def _extract_from_df(df, filename):
    """
    Extract variable metadata rows from one spreadsheet dataframe.

    Likely variable/label/description/role columns are located by header
    name via _first_existing(); every row with a non-empty value in the
    variable column becomes one entry.

    Args:
        df: pandas DataFrame for a single sheet (may be None or empty).
        filename: source file path, recorded for provenance.

    Returns:
        List of dicts: {'term', 'definition', 'file', 'type', 'sources'}.
    """
    out = []
    if df is None or df.empty:
        return out

    cols = list(df.columns)
    term_col = _first_existing(cols, VAR_COL_CANDIDATES)
    label_col = _first_existing(cols, LABEL_COL_CANDIDATES)
    desc_col = _first_existing(cols, DESC_COL_CANDIDATES)
    role_col = _first_existing(cols, ROLE_COL_CANDIDATES)

    # If we absolutely cannot find a term column, fall back to the first
    # column; if there are no columns at all, give up.
    if not term_col:
        term_col = cols[0] if cols else None
    if not term_col:
        return out

    def _cell(row, col):
        # Best-effort string read of one cell. Returns "" when the column
        # was not found (col is None), the value is None/NaN (pandas fills
        # blanks with NaN, which is truthy and would otherwise stringify
        # to the bogus value "nan"), or anything raises.
        if col is None or col not in df.columns:
            return ""
        try:
            val = row.get(col, "")
            if val is None or (pd is not None and pd.isna(val)):
                return ""
            return str(val).strip()
        except Exception:
            return ""

    for _, row in df.iterrows():
        term = _cell(row, term_col)
        if not term:
            continue

        label = _cell(row, label_col)
        desc = _cell(row, desc_col)
        role = _cell(row, role_col)

        # Compose a clean multi-line definition from whatever we found.
        parts = []
        if label:
            parts.append(f"Label: {label}")
        if desc:
            parts.append(f"Description: {desc}")
        if role:
            parts.append(f"Role/Origin: {role}")
        definition = " \n".join(parts).strip() or (label or desc or "")

        out.append({
            "term": term,
            "definition": definition,
            "file": os.path.basename(filename),
            "type": "variable",
            "sources": [os.path.basename(filename)]
        })

    return out
148
+
149
+
150
def _read_cache():
    """Return cached entries if the cache file exists and is fresh, else None."""
    try:
        if os.path.exists(CACHE_PATH):
            if time.time() - os.path.getmtime(CACHE_PATH) < CACHE_TTL_SECONDS:
                with open(CACHE_PATH, "r", encoding="utf-8") as f:
                    return json.load(f)
    except Exception:
        # Unreadable or corrupt cache: fall through to a full rebuild.
        pass
    return None


def _write_cache(entries):
    """Best-effort persist of *entries*; failures (e.g. read-only fs) are ignored."""
    try:
        with open(CACHE_PATH, "w", encoding="utf-8") as f:
            json.dump(entries, f, ensure_ascii=False, indent=2)
    except Exception:
        pass


def _entries_from_file(fx):
    """Extract entries from every sheet of one Excel file (best effort)."""
    entries = []
    try:
        # One parsed workbook handle for all sheets; the previous code
        # called pd.read_excel(fx, sheet_name=sheet) per sheet, re-parsing
        # the whole file each time.
        xls = pd.ExcelFile(fx)
        for sheet in xls.sheet_names:
            try:
                # drop rows where all cells are NaN
                df = xls.parse(sheet).dropna(how="all")
                sheet_entries = _extract_from_df(df, fx)
                for e in sheet_entries:
                    # annotate with sheet name to improve provenance
                    e["sources"].append(f"{os.path.basename(fx)}::{sheet}")
                entries.extend(sheet_entries)
            except Exception:
                # try next sheet
                continue
    except Exception:
        # fallback: try a plain single-sheet read
        try:
            df = pd.read_excel(fx).dropna(how="all")
            entries.extend(_extract_from_df(df, fx))
        except Exception as e:
            logger.debug("Failed reading excel %s: %s", fx, e)
    return entries


def load_variable_metadata(search_paths=None, use_cache=True, verbose=True):
    """
    Discover SDTM/ADaM/CDASH excel files and extract variable metadata.

    Args:
        search_paths: directories to scan; None uses the defaults in
            _discover_files().
        use_cache: when True, serve results from CACHE_PATH if fresher
            than CACHE_TTL_SECONDS, and refresh it after a rebuild.
        verbose: when True, log progress at INFO level.

    Returns:
        List of dicts: {'term', 'definition', 'file', 'type', 'sources'},
        de-duplicated by normalized term (first occurrence wins).
        Empty list when pandas is unavailable.
    """
    # quick fail if pandas not installed
    if pd is None:
        logger.warning("pandas not available — variable metadata loading skipped.")
        return []

    if use_cache:
        cached = _read_cache()
        if cached is not None:
            if verbose:
                logger.info("Loading variable metadata from cache: %s", CACHE_PATH)
            return cached

    files = _discover_files(search_paths=search_paths)
    if verbose:
        logger.info("Variable loader discovered %d candidate Excel files.", len(files))

    all_entries = []
    for fx in files:
        all_entries.extend(_entries_from_file(fx))

    # dedupe by normalized term, keeping the first occurrence
    seen = set()
    deduped = []
    for e in all_entries:
        key = e["term"].strip().lower()
        if key and key not in seen:
            seen.add(key)
            deduped.append(e)

    _write_cache(deduped)

    if verbose:
        logger.info("Variable loader extracted %d unique variables.", len(deduped))

    return deduped
229
+
230
+
231
if __name__ == "__main__":
    # quick CLI for debugging: runs the full discover/extract pipeline
    # (honoring the /tmp JSON cache) and prints a small sample of output
    items = load_variable_metadata(verbose=True)
    print(f"[variable_loader] extracted {len(items)} items")
    if items:
        print("Sample:", items[:5])