Spaces:

mmrech
/

elsi-explorer

Sleeping

Matheus Rech

Initial commit: ELSI Explorer v0.1

097315f about 2 months ago

3.82 kB

	"""Searchable codebook across both ELSI waves."""
	from __future__ import annotations
	import functools
	import re
	import pandas as pd
	from app.config import MODULE_LABELS
	from app.data.loader import load_all


	def _module_prefix(var: str) -> str:
	    """Return the lower-cased leading run of ASCII letters in *var*.

	    Falls back to ``"_other"`` when *var* does not start with a letter.
	    """
	    leading = re.match(r"^([a-zA-Z]+)", var)
	    if leading is None:
	        return "_other"
	    return leading.group(1).lower()


	@functools.lru_cache(maxsize=1)
	def build_codebook() -> pd.DataFrame:
	    """Return one row per (wave, variable) describing every loaded variable.

	    Columns: ``wave``, ``variable``, ``label``, ``module_prefix``, ``module``,
	    ``dtype``, ``n_nonmiss``, ``pct_miss`` (percentage, 2 decimals),
	    ``n_unique`` (missing values excluded) and ``has_value_labels``.

	    The result is memoised by ``lru_cache``: the same DataFrame object is
	    returned on every call, so callers should copy it before mutating.
	    """
	    # Declared up front so the frame keeps its schema even when no bundle
	    # contributes any variable (``pd.DataFrame([])`` alone has no columns).
	    columns = [
	        "wave",
	        "variable",
	        "label",
	        "module_prefix",
	        "module",
	        "dtype",
	        "n_nonmiss",
	        "pct_miss",
	        "n_unique",
	        "has_value_labels",
	    ]
	    rows = []
	    # Each bundle is expected to expose .df, .variables(), .label_for()
	    # and .value_labels, as used below.
	    for wave, b in load_all().items():
	        for var in b.variables():
	            s = b.df[var]
	            prefix = _module_prefix(var)
	            rows.append({
	                "wave": wave,
	                "variable": var,
	                "label": b.label_for(var),
	                "module_prefix": prefix,
	                # Prefixes without a configured label get a visible placeholder.
	                "module": MODULE_LABELS.get(prefix, f"{prefix.upper()}. (unmapped)"),
	                "dtype": str(s.dtype),
	                "n_nonmiss": int(s.notna().sum()),
	                "pct_miss": round(s.isna().mean() * 100, 2),
	                "n_unique": int(s.nunique(dropna=True)),
	                "has_value_labels": var in b.value_labels,
	            })
	    return pd.DataFrame(rows, columns=columns)


	def search_codebook(
	    query: str = "",
	    wave: int | None = None,
	    module_prefix: str | None = None,
	    max_pct_miss: float | None = None,
	    only_with_value_labels: bool = False,
	    limit: int = 200,
	) -> pd.DataFrame:
	    """Filter the cached codebook and return at most *limit* matching rows.

	    Args:
	        query: Free-text search. It is lower-cased and split on whitespace;
	            every token must occur as a substring of the lower-cased
	            "<variable> <label>" text of a row. Empty or whitespace-only
	            queries apply no text filter.
	        wave: Keep only rows of this wave; ``None`` keeps all waves.
	        module_prefix: Keep only rows whose ``module_prefix`` equals this
	            value (compared lower-cased); a falsy value keeps all modules.
	        max_pct_miss: Keep only rows whose ``pct_miss`` is <= this value.
	        only_with_value_labels: Keep only rows flagged ``has_value_labels``.
	        limit: Maximum number of rows, passed to ``DataFrame.head``.

	    Returns:
	        A new DataFrame with a fresh 0..n-1 index. The cached codebook
	        returned by ``build_codebook`` is copied first and never modified.
	    """
	    out = build_codebook().copy()
	    if wave is not None:
	        out = out[out["wave"] == wave]
	    if module_prefix:
	        out = out[out["module_prefix"] == module_prefix.lower()]
	    if max_pct_miss is not None:
	        out = out[out["pct_miss"] <= max_pct_miss]
	    if only_with_value_labels:
	        out = out[out["has_value_labels"]]

	    # Multi-token AND search over variable name and label, case-insensitive.
	    tokens = query.lower().split() if query else []
	    if tokens:
	        haystacks = (
	            f"{var} {label}".lower()
	            for var, label in zip(out["variable"], out["label"])
	        )
	        # An explicit boolean Series (rather than row-wise DataFrame.apply)
	        # stays a valid mask even when no rows are left to search.
	        mask = pd.Series(
	            [all(tok in hay for tok in tokens) for hay in haystacks],
	            index=out.index,
	            dtype=bool,
	        )
	        out = out[mask]
	    return out.head(limit).reset_index(drop=True)


	def variable_detail(wave: int, variable: str) -> dict:
	    """Describe a single variable of one wave.

	    Returns a dict with ``wave``, ``variable``, ``label``, ``module``,
	    ``dtype``, ``n_nonmiss``, ``pct_miss``, ``n_unique`` and ``value_labels``,
	    plus exactly one of:

	    * ``summary`` - mean/sd/min/p25/median/p75/max, when the column has a
	      numeric dtype and more than 10 distinct non-missing values;
	    * ``top_categories`` - up to 20 most frequent values of the labeled
	      column (missing values counted), each with ``value``, ``n`` and ``pct``.

	    Raises:
	        ValueError: if *wave* is not among the loaded bundles.
	        KeyError: if *variable* is not a column of that wave's data.
	    """
	    all_bundles = load_all()
	    if wave not in all_bundles:
	        raise ValueError(f"Unknown wave {wave}")
	    bundle = all_bundles[wave]
	    if variable not in bundle.df.columns:
	        raise KeyError(f"{variable} not in wave {wave}")

	    column = bundle.df[variable]
	    label = bundle.label_for(variable)
	    module = MODULE_LABELS.get(_module_prefix(variable), "")
	    dtype_name = str(column.dtype)
	    non_missing = int(column.notna().sum())
	    missing_pct = round(column.isna().mean() * 100, 2)
	    distinct = int(column.nunique(dropna=True))
	    info = {
	        "wave": wave,
	        "variable": variable,
	        "label": label,
	        "module": module,
	        "dtype": dtype_name,
	        "n_nonmiss": non_missing,
	        "pct_miss": missing_pct,
	        "n_unique": distinct,
	        "value_labels": bundle.value_map(variable),
	    }

	    # Numeric columns with enough spread get distribution statistics.
	    if pd.api.types.is_numeric_dtype(column) and distinct > 10:
	        info["summary"] = {
	            "mean": float(column.mean()),
	            "sd": float(column.std()),
	            "min": float(column.min()),
	            "p25": float(column.quantile(0.25)),
	            "median": float(column.median()),
	            "p75": float(column.quantile(0.75)),
	            "max": float(column.max()),
	        }
	        return info

	    # Everything else is treated as categorical: report the most frequent
	    # values of the labeled column, as a share of all rows.
	    labeled = bundle.df_labeled[variable]
	    total = len(labeled)
	    counts = labeled.value_counts(dropna=False).head(20)
	    info["top_categories"] = [
	        {"value": str(value), "n": int(count), "pct": round(count / total * 100, 2)}
	        for value, count in counts.items()
	    ]
	    return info