Spaces:
Sleeping
Sleeping
| """Searchable codebook across both ELSI waves.""" | |
| from __future__ import annotations | |
| import functools | |
| import re | |
| import pandas as pd | |
| from app.config import MODULE_LABELS | |
| from app.data.loader import load_all | |
def _module_prefix(var: str) -> str:
    """Return the lower-cased run of ASCII letters at the start of ``var``.

    Names that do not begin with an ASCII letter (the empty string included)
    are assigned to the catch-all ``"_other"`` bucket.
    """
    leading_letters = re.match(r"^([a-zA-Z]+)", var)
    if leading_letters is None:
        return "_other"
    return leading_letters.group(1).lower()
def build_codebook() -> pd.DataFrame:
    """Return one row per (wave, variable) describing every loaded variable.

    Columns, in order:

    * ``wave`` / ``variable`` -- key of the bundle and name of the column.
    * ``label`` -- whatever ``bundle.label_for(variable)`` returns.
    * ``module_prefix`` -- leading letters of the name (see ``_module_prefix``).
    * ``module`` -- ``MODULE_LABELS`` entry for the prefix, or
      ``"<PREFIX>. (unmapped)"`` when the prefix has no entry.
    * ``dtype`` -- string form of the column dtype.
    * ``n_nonmiss`` -- count of non-missing values.
    * ``pct_miss`` -- share of missing values on a 0-100 scale, 2 decimals.
    * ``n_unique`` -- number of distinct non-missing values.
    * ``has_value_labels`` -- whether the variable is in ``bundle.value_labels``.

    The statistics are recomputed from ``load_all()`` on every call; nothing
    is cached at this level.
    """
    # Passing the schema explicitly keeps the columns present even when no
    # bundle contributes a row; a bare ``pd.DataFrame([])`` has no columns,
    # which makes every downstream ``cb["wave"]``-style lookup raise KeyError.
    columns = [
        "wave",
        "variable",
        "label",
        "module_prefix",
        "module",
        "dtype",
        "n_nonmiss",
        "pct_miss",
        "n_unique",
        "has_value_labels",
    ]
    rows = []
    for wave, bundle in load_all().items():
        for var in bundle.variables():
            series = bundle.df[var]
            missing = series.isna()  # computed once, reused for both stats
            prefix = _module_prefix(var)
            rows.append({
                "wave": wave,
                "variable": var,
                "label": bundle.label_for(var),
                "module_prefix": prefix,
                "module": MODULE_LABELS.get(prefix, f"{prefix.upper()}. (unmapped)"),
                "dtype": str(series.dtype),
                "n_nonmiss": int((~missing).sum()),
                "pct_miss": round(missing.mean() * 100, 2),
                "n_unique": int(series.nunique(dropna=True)),
                "has_value_labels": var in bundle.value_labels,
            })
    return pd.DataFrame(rows, columns=columns)
def search_codebook(
    query: str = "",
    wave: int | None = None,
    module_prefix: str | None = None,
    max_pct_miss: float | None = None,
    only_with_value_labels: bool = False,
    limit: int = 200,
) -> pd.DataFrame:
    """Filter the codebook and return at most ``limit`` matching rows.

    Args:
        query: Free text. It is lower-cased and split on whitespace; a row
            matches when *every* token occurs as a literal substring of
            ``"<variable> <label>"`` (case-insensitive). Empty or
            whitespace-only text disables the text filter.
        wave: Keep only rows whose ``wave`` equals this value.
        module_prefix: Keep only rows with this prefix (compared lower-cased).
        max_pct_miss: Keep only rows with ``pct_miss`` at or below this value.
        only_with_value_labels: Keep only rows flagged ``has_value_labels``.
        limit: Maximum number of rows returned (applied with ``head``).

    Returns:
        A new DataFrame with the codebook columns and a fresh 0..n-1 index.
    """
    out = build_codebook()
    if out.empty:
        # Nothing to search. Returning here also avoids KeyError from the
        # column filters below if the empty frame carries no columns.
        return out.reset_index(drop=True)

    if wave is not None:
        out = out[out["wave"] == wave]
    if module_prefix:
        out = out[out["module_prefix"] == module_prefix.lower()]
    if max_pct_miss is not None:
        out = out[out["pct_miss"] <= max_pct_miss]
    if only_with_value_labels:
        out = out[out["has_value_labels"]]

    tokens = query.lower().split() if query else []
    if tokens and not out.empty:
        # Missing labels are searched as empty text. Formatting them through
        # an f-string would turn them into "None"/"nan" and make those words
        # match unlabeled variables.
        # NOTE(review): whether label_for() can return a missing value is not
        # visible from this function -- confirm against the loader.
        label_text = out["label"].fillna("").astype(str)
        haystack = (out["variable"].astype(str) + " " + label_text).str.lower()
        keep = pd.Series(True, index=out.index)
        for token in tokens:
            # regex=False: tokens are literal substrings, not patterns.
            keep &= haystack.str.contains(token, regex=False)
        out = out[keep]

    return out.head(limit).reset_index(drop=True)
def variable_detail(wave: int, variable: str) -> dict:
    """Describe a single variable of one wave.

    The result always holds ``wave``, ``variable``, ``label``, ``module``
    (empty string when the prefix is not in ``MODULE_LABELS``), ``dtype``,
    ``n_nonmiss``, ``pct_miss``, ``n_unique`` and ``value_labels``. It then
    carries exactly one of:

    * ``summary`` -- mean, sd, min, quartiles and max, for numeric columns
      with more than 10 distinct non-missing values;
    * ``top_categories`` -- the 20 most frequent values of the labeled column
      (missing values counted), each with its count and percentage of rows.

    Raises:
        ValueError: ``wave`` is not a key of the loaded bundles.
        KeyError: ``variable`` is not a column of that wave's data.
    """
    bundles = load_all()
    if wave not in bundles:
        raise ValueError(f"Unknown wave {wave}")
    bundle = bundles[wave]
    if variable not in bundle.df.columns:
        raise KeyError(f"{variable} not in wave {wave}")

    series = bundle.df[variable]
    info = {
        "wave": wave,
        "variable": variable,
        "label": bundle.label_for(variable),
        "module": MODULE_LABELS.get(_module_prefix(variable), ""),
        "dtype": str(series.dtype),
        "n_nonmiss": int(series.notna().sum()),
        "pct_miss": round(series.isna().mean() * 100, 2),
        "n_unique": int(series.nunique(dropna=True)),
        "value_labels": bundle.value_map(variable),
    }

    is_continuous = pd.api.types.is_numeric_dtype(series) and info["n_unique"] > 10
    if not is_continuous:
        # Few distinct values or non-numeric: report frequencies of the
        # labeled representation instead of moments.
        labeled = bundle.df_labeled[variable]
        n_rows = len(labeled)
        counts = labeled.value_counts(dropna=False).head(20)
        top_categories = []
        for value, count in counts.items():
            top_categories.append(
                {"value": str(value), "n": int(count), "pct": round(count / n_rows * 100, 2)}
            )
        info["top_categories"] = top_categories
        return info

    stats = (
        ("mean", series.mean()),
        ("sd", series.std()),
        ("min", series.min()),
        ("p25", series.quantile(0.25)),
        ("median", series.median()),
        ("p75", series.quantile(0.75)),
        ("max", series.max()),
    )
    info["summary"] = {name: float(value) for name, value in stats}
    return info