"""Searchable codebook across both ELSI waves.""" from __future__ import annotations import functools import re import pandas as pd from app.config import MODULE_LABELS from app.data.loader import load_all def _module_prefix(var: str) -> str: m = re.match(r"^([a-zA-Z]+)", var) return m.group(1).lower() if m else "_other" @functools.lru_cache(maxsize=1) def build_codebook() -> pd.DataFrame: """Return one row per (wave, variable) with label, dtype, missingness, n_unique, value-label flag.""" bundles = load_all() rows = [] for wave, b in bundles.items(): for var in b.variables(): s = b.df[var] prefix = _module_prefix(var) rows.append({ "wave": wave, "variable": var, "label": b.label_for(var), "module_prefix": prefix, "module": MODULE_LABELS.get(prefix, f"{prefix.upper()}. (unmapped)"), "dtype": str(s.dtype), "n_nonmiss": int(s.notna().sum()), "pct_miss": round(s.isna().mean() * 100, 2), "n_unique": int(s.nunique(dropna=True)), "has_value_labels": var in b.value_labels, }) cb = pd.DataFrame(rows) return cb def search_codebook( query: str = "", wave: int | None = None, module_prefix: str | None = None, max_pct_miss: float | None = None, only_with_value_labels: bool = False, limit: int = 200, ) -> pd.DataFrame: cb = build_codebook() out = cb.copy() if wave is not None: out = out[out["wave"] == wave] if module_prefix: out = out[out["module_prefix"] == module_prefix.lower()] if max_pct_miss is not None: out = out[out["pct_miss"] <= max_pct_miss] if only_with_value_labels: out = out[out["has_value_labels"]] if query: q = query.lower().strip() # Match against variable name or label, case-insensitive, multi-token AND tokens = [t for t in re.split(r"\s+", q) if t] def hit(row): blob = f"{row['variable']} {row['label']}".lower() return all(t in blob for t in tokens) out = out[out.apply(hit, axis=1)] return out.head(limit).reset_index(drop=True) def variable_detail(wave: int, variable: str) -> dict: """Detailed info for one variable: label, value labels, summary stats.""" bundles = load_all() if wave not in bundles: raise ValueError(f"Unknown wave {wave}") b = bundles[wave] if variable not in b.df.columns: raise KeyError(f"{variable} not in wave {wave}") s = b.df[variable] info = { "wave": wave, "variable": variable, "label": b.label_for(variable), "module": MODULE_LABELS.get(_module_prefix(variable), ""), "dtype": str(s.dtype), "n_nonmiss": int(s.notna().sum()), "pct_miss": round(s.isna().mean() * 100, 2), "n_unique": int(s.nunique(dropna=True)), "value_labels": b.value_map(variable), } # Numeric summary if numeric and has spread if pd.api.types.is_numeric_dtype(s) and info["n_unique"] > 10: info["summary"] = { "mean": float(s.mean()), "sd": float(s.std()), "min": float(s.min()), "p25": float(s.quantile(0.25)), "median": float(s.median()), "p75": float(s.quantile(0.75)), "max": float(s.max()), } else: # Categorical: top frequencies (using labeled values) s_lab = b.df_labeled[variable] vc = s_lab.value_counts(dropna=False).head(20) info["top_categories"] = [ {"value": str(idx), "n": int(n), "pct": round(n / len(s_lab) * 100, 2)} for idx, n in vc.items() ] return info