File size: 3,818 Bytes
097315f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Searchable codebook across both ELSI waves."""
from __future__ import annotations
import functools
import re
import pandas as pd
from app.config import MODULE_LABELS
from app.data.loader import load_all


def _module_prefix(var: str) -> str:
    m = re.match(r"^([a-zA-Z]+)", var)
    return m.group(1).lower() if m else "_other"


@functools.lru_cache(maxsize=1)
def build_codebook() -> pd.DataFrame:
    """Return one row per (wave, variable) describing every loaded variable.

    Columns:
        wave, variable: identify the row.
        label: result of the bundle's ``label_for(variable)``.
        module_prefix: leading letters of the variable name, lower-cased.
        module: ``MODULE_LABELS`` entry for the prefix, or an
            "<PREFIX>. (unmapped)" placeholder when the prefix is unknown.
        dtype: string form of the column dtype.
        n_nonmiss: count of non-missing values.
        pct_miss: percentage of missing values, rounded to 2 decimals.
        n_unique: number of distinct non-missing values.
        has_value_labels: whether the variable appears in the bundle's
            ``value_labels``.

    The frame always carries the full column set, even when no variables are
    loaded, so callers can filter on any column without hitting a KeyError.

    The result is memoised by ``lru_cache``: every caller receives the same
    DataFrame object, so copy it before mutating.
    """
    # Pin the schema explicitly: pd.DataFrame([]) would otherwise produce a
    # frame with no columns at all when there is nothing to describe.
    columns = [
        "wave",
        "variable",
        "label",
        "module_prefix",
        "module",
        "dtype",
        "n_nonmiss",
        "pct_miss",
        "n_unique",
        "has_value_labels",
    ]
    rows = []
    for wave, b in load_all().items():
        for var in b.variables():
            s = b.df[var]
            prefix = _module_prefix(var)
            rows.append({
                "wave": wave,
                "variable": var,
                "label": b.label_for(var),
                "module_prefix": prefix,
                "module": MODULE_LABELS.get(prefix, f"{prefix.upper()}. (unmapped)"),
                "dtype": str(s.dtype),
                "n_nonmiss": int(s.notna().sum()),
                "pct_miss": round(s.isna().mean() * 100, 2),
                "n_unique": int(s.nunique(dropna=True)),
                "has_value_labels": var in b.value_labels,
            })
    return pd.DataFrame(rows, columns=columns)


def search_codebook(
    query: str = "",
    wave: int | None = None,
    module_prefix: str | None = None,
    max_pct_miss: float | None = None,
    only_with_value_labels: bool = False,
    limit: int = 200,
) -> pd.DataFrame:
    """Filter the codebook and return at most ``limit`` matching rows.

    Args:
        query: Free-text search. The text is lower-cased and split on
            whitespace; a row matches only if every token occurs as a
            substring of its variable name or label (case-insensitive).
            Empty or whitespace-only text disables text filtering.
        wave: Keep only rows of this wave; ``None`` keeps all waves.
        module_prefix: Keep only rows whose module prefix equals this value
            (compared lower-cased); falsy keeps all modules.
        max_pct_miss: Keep only rows whose missing percentage is at most
            this value; ``None`` disables the filter.
        only_with_value_labels: Keep only rows flagged as having value labels.
        limit: Maximum number of rows returned (passed to ``DataFrame.head``).

    Returns:
        A new DataFrame with a fresh 0..n-1 index; the cached codebook
        returned by ``build_codebook`` is never modified.
    """
    # No upfront copy needed: boolean indexing and reset_index() below both
    # produce new objects, so the shared cached frame is left untouched.
    out = build_codebook()
    if wave is not None:
        out = out[out["wave"] == wave]
    if module_prefix:
        out = out[out["module_prefix"] == module_prefix.lower()]
    if max_pct_miss is not None:
        out = out[out["pct_miss"] <= max_pct_miss]
    if only_with_value_labels:
        out = out[out["has_value_labels"]]

    tokens = query.lower().split()
    # Skip text matching on an empty frame: there is nothing to match and the
    # frame is returned with its columns intact.
    if tokens and not out.empty:
        # Treat missing labels as empty text so that queries such as "none"
        # or "nan" do not match variables that simply have no label.
        labels = out["label"].fillna("").astype(str)
        blob = (out["variable"].astype(str) + " " + labels).str.lower()
        keep = pd.Series(True, index=out.index)
        for token in tokens:
            # regex=False: tokens are literal substrings, not patterns.
            keep &= blob.str.contains(token, regex=False)
        out = out[keep]
    return out.head(limit).reset_index(drop=True)


def variable_detail(wave: int, variable: str) -> dict:
    """Describe a single variable of one wave.

    The returned dict always holds the label, module label, dtype,
    missingness figures, distinct-value count and value-label map. Numeric
    columns with more than 10 distinct values additionally get a
    ``"summary"`` of descriptive statistics; every other column gets
    ``"top_categories"``, the first 20 entries of the labelled value counts
    (missing values included) with their counts and percentages.

    Raises:
        ValueError: if ``wave`` is not among the loaded bundles.
        KeyError: if ``variable`` is not a column of that wave's frame.
    """
    all_bundles = load_all()
    if wave not in all_bundles:
        raise ValueError(f"Unknown wave {wave}")
    bundle = all_bundles[wave]
    if variable not in bundle.df.columns:
        raise KeyError(f"{variable} not in wave {wave}")

    column = bundle.df[variable]
    detail = {"wave": wave, "variable": variable}
    detail["label"] = bundle.label_for(variable)
    detail["module"] = MODULE_LABELS.get(_module_prefix(variable), "")
    detail["dtype"] = str(column.dtype)
    detail["n_nonmiss"] = int(column.notna().sum())
    detail["pct_miss"] = round(column.isna().mean() * 100, 2)
    detail["n_unique"] = int(column.nunique(dropna=True))
    detail["value_labels"] = bundle.value_map(variable)

    has_spread = pd.api.types.is_numeric_dtype(column) and detail["n_unique"] > 10
    if not has_spread:
        # Categorical view: frequencies are taken from the labelled frame.
        labelled = bundle.df_labeled[variable]
        n_rows = len(labelled)
        frequencies = labelled.value_counts(dropna=False).head(20)
        top = []
        for value, count in frequencies.items():
            top.append(
                {"value": str(value), "n": int(count), "pct": round(count / n_rows * 100, 2)}
            )
        detail["top_categories"] = top
        return detail

    # Numeric view: each statistic is computed lazily, in display order.
    statistics = (
        ("mean", column.mean),
        ("sd", column.std),
        ("min", column.min),
        ("p25", functools.partial(column.quantile, 0.25)),
        ("median", column.median),
        ("p75", functools.partial(column.quantile, 0.75)),
        ("max", column.max),
    )
    detail["summary"] = {name: float(compute()) for name, compute in statistics}
    return detail