Spaces:
Sleeping
Sleeping
File size: 3,818 Bytes
"""Searchable codebook across both ELSI waves."""
from __future__ import annotations
import functools
import re
import pandas as pd
from app.config import MODULE_LABELS
from app.data.loader import load_all
def _module_prefix(var: str) -> str:
    """Return the lower-cased run of ASCII letters that starts *var*.

    Names that do not begin with an ASCII letter (digits, underscores, the
    empty string, ...) are bucketed under the sentinel ``"_other"``.
    """
    leading_letters = re.match(r"^([a-zA-Z]+)", var)
    if leading_letters is None:
        return "_other"
    return leading_letters.group(1).lower()
@functools.lru_cache(maxsize=1)
def build_codebook() -> pd.DataFrame:
    """Return one row per (wave, variable) describing every loaded variable.

    Columns, in order: ``wave``, ``variable``, ``label``, ``module_prefix``,
    ``module``, ``dtype``, ``n_nonmiss``, ``pct_miss`` (percentage, rounded to
    two decimals), ``n_unique`` (distinct non-missing values) and
    ``has_value_labels``.

    ``module`` is looked up in ``MODULE_LABELS`` by prefix and falls back to
    ``"<PREFIX>. (unmapped)"`` when the prefix is not registered.

    The result is memoised by ``lru_cache``: the frame is built on the first
    call and the *same object* is returned afterwards, so callers must copy it
    before mutating.
    """
    # Pin the schema so that an empty input still yields a frame with the
    # expected columns; a bare ``pd.DataFrame([])`` has none, and any later
    # column lookup on it would raise KeyError.
    columns = [
        "wave",
        "variable",
        "label",
        "module_prefix",
        "module",
        "dtype",
        "n_nonmiss",
        "pct_miss",
        "n_unique",
        "has_value_labels",
    ]
    bundles = load_all()
    rows = []
    for wave, b in bundles.items():
        for var in b.variables():
            s = b.df[var]
            prefix = _module_prefix(var)
            rows.append({
                "wave": wave,
                "variable": var,
                "label": b.label_for(var),
                "module_prefix": prefix,
                "module": MODULE_LABELS.get(prefix, f"{prefix.upper()}. (unmapped)"),
                "dtype": str(s.dtype),
                "n_nonmiss": int(s.notna().sum()),
                "pct_miss": round(s.isna().mean() * 100, 2),
                "n_unique": int(s.nunique(dropna=True)),
                "has_value_labels": var in b.value_labels,
            })
    return pd.DataFrame(rows, columns=columns)
def search_codebook(
    query: str = "",
    wave: int | None = None,
    module_prefix: str | None = None,
    max_pct_miss: float | None = None,
    only_with_value_labels: bool = False,
    limit: int = 200,
) -> pd.DataFrame:
    """Filter the codebook and return at most *limit* matching rows.

    Every supplied filter is combined with AND; filters left at their default
    are ignored.

    Args:
        query: Free text. It is lower-cased and split on whitespace; a row
            matches when *every* token occurs as a literal substring of
            ``"<variable> <label>"`` (case-insensitive). Missing labels are
            treated as empty text.
        wave: Keep only rows whose ``wave`` equals this value.
        module_prefix: Keep only rows with this ``module_prefix``
            (compared lower-cased).
        max_pct_miss: Keep only rows whose ``pct_miss`` is at most this value.
        only_with_value_labels: Keep only rows flagged ``has_value_labels``.
        limit: Maximum number of rows, applied with ``DataFrame.head``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. The cached codebook
        returned by ``build_codebook`` is not modified.
    """
    cb = build_codebook()
    if cb.empty:
        # Nothing to filter; also avoids KeyError if the empty frame carries
        # no columns at all.
        return cb.reset_index(drop=True)

    # Accumulate one boolean mask instead of re-slicing the frame per filter.
    keep = pd.Series(True, index=cb.index)
    if wave is not None:
        keep &= cb["wave"] == wave
    if module_prefix:
        keep &= cb["module_prefix"] == module_prefix.lower()
    if max_pct_miss is not None:
        keep &= cb["pct_miss"] <= max_pct_miss
    if only_with_value_labels:
        keep &= cb["has_value_labels"].astype(bool)

    out = cb.loc[keep]

    tokens = query.lower().split()
    if tokens and not out.empty:
        labels = out["label"]
        # Blank out missing labels so they do not turn into the literal text
        # "None"/"nan" and produce spurious hits for such queries.
        labels = labels.where(labels.notna(), "").astype(str)
        blob = (out["variable"].astype(str) + " " + labels).str.lower()

        hit = pd.Series(True, index=out.index)
        for tok in tokens:
            # regex=False: tokens are plain substrings, not patterns.
            hit &= blob.str.contains(tok, regex=False)
        out = out.loc[hit]

    return out.head(limit).reset_index(drop=True)
def variable_detail(wave: int, variable: str) -> dict:
    """Describe a single variable of a single wave.

    The returned dict always carries ``wave``, ``variable``, ``label``,
    ``module``, ``dtype``, ``n_nonmiss``, ``pct_miss``, ``n_unique`` and
    ``value_labels``. In addition it holds exactly one of:

    * ``summary`` - mean, sd, min, quartiles and max, when the column is
      numeric and has more than 10 distinct non-missing values;
    * ``top_categories`` - up to 20 most frequent values of the labeled
      column (missing values included), each with its count and its
      percentage of all rows.

    Raises:
        ValueError: if *wave* is not among the loaded bundles.
        KeyError: if *variable* is not a column of that wave's data.
    """
    bundles = load_all()
    if wave not in bundles:
        raise ValueError(f"Unknown wave {wave}")

    bundle = bundles[wave]
    if variable not in bundle.df.columns:
        raise KeyError(f"{variable} not in wave {wave}")

    series = bundle.df[variable]
    label = bundle.label_for(variable)
    module = MODULE_LABELS.get(_module_prefix(variable), "")
    dtype_name = str(series.dtype)
    present = int(series.notna().sum())
    missing_pct = round(series.isna().mean() * 100, 2)
    distinct = int(series.nunique(dropna=True))
    value_labels = bundle.value_map(variable)

    detail = {
        "wave": wave,
        "variable": variable,
        "label": label,
        "module": module,
        "dtype": dtype_name,
        "n_nonmiss": present,
        "pct_miss": missing_pct,
        "n_unique": distinct,
        "value_labels": value_labels,
    }

    if pd.api.types.is_numeric_dtype(series) and distinct > 10:
        # Numeric column with enough spread: report distribution statistics.
        reducers = (
            ("mean", series.mean),
            ("sd", series.std),
            ("min", series.min),
            ("p25", lambda: series.quantile(0.25)),
            ("median", series.median),
            ("p75", lambda: series.quantile(0.75)),
            ("max", series.max),
        )
        detail["summary"] = {key: float(compute()) for key, compute in reducers}
        return detail

    # Otherwise treat it as categorical and count the labeled values.
    labeled = bundle.df_labeled[variable]
    total_rows = len(labeled)
    top = []
    for value, count in labeled.value_counts(dropna=False).head(20).items():
        top.append({
            "value": str(value),
            "n": int(count),
            "pct": round(count / total_rows * 100, 2),
        })
    detail["top_categories"] = top
    return detail
|