dummy_app / scripts /02_scbpull_AI_assisted.py
joseph-data's picture
dummy app start
1264273 unverified
from __future__ import annotations
from pathlib import Path
from typing import Dict, List, Tuple
import pandas as pd
from pyscbwrapper import SCB
# ===================== #
# ---- CONFIG AREA ---- #
# ===================== #
TAX_ID = "ssyk2012" # or "ssyk96"
TABLES: Dict[str, Tuple[str, ...]] = {
"ssyk2012_tab": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
"ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
}
def get_project_root() -> Path:
try:
return Path(__file__).resolve().parents[1]
except NameError:
return Path.cwd().resolve().parents[1]
# ========================= #
# ---- CORE FUNCTIONS ---- #
# ========================= #
def latest_year_from_vars(vars_: Dict[str, List[str]]) -> str:
years = []
for y in vars_.get("year", []):
try:
years.append(int(y))
except Exception:
pass
return str(max(years))
def fetch_scb_metadata_and_year(tax_id: str):
scb = SCB(*TABLES[f"{tax_id}_tab"])
vars_ = scb.get_variables()
occupations_key, occupations = next(iter(vars_.items()))
clean_key = occupations_key.replace(" ", "") # wrapper quirk
latest_year = latest_year_from_vars(vars_)
return scb, clean_key, occupations, latest_year
def fetch_data_for_year(
scb: SCB, occ_key: str, occupations: List[str], year: str
) -> Tuple[List[dict], Dict[str, str]]:
scb.set_query(**{occ_key: occupations, "year": [year]})
scb_data = scb.get_data()
scb_fetch = scb_data["data"]
codes = scb.get_query()["query"][0]["selection"]["values"]
occ_dict = dict(zip(codes, occupations))
return scb_fetch, occ_dict
def build_weights_df(scb_fetch: List[dict], occ_dict: Dict[str, str]) -> pd.DataFrame:
# raw records
records = []
for r in scb_fetch:
code, year = r["key"][:2] # occupation code, year
name = occ_dict.get(code, code)
value = r["values"][0]
records.append(
{"code_4": code, "occupation": name, "year": year, "value": value}
)
df = pd.DataFrame(records)
df = df[df["code_4"] != "0002"].reset_index(drop=True) ## remove unidentified group
# keep year as string, codes as strings; value numeric
df["year"] = df["year"].astype(str)
df["value"] = pd.to_numeric(df["value"], errors="coerce")
df = df.dropna(subset=["value"]).reset_index(drop=True)
df["value"] = df["value"].astype(int)
df = df[df["value"] > 0].reset_index(drop=True)
# code hierarchy
df["code_4"] = (
df["code_4"]
.astype(str)
.str.replace(r"\D", "", regex=True)
.str.slice(0, 4)
.str.zfill(4)
)
df["code_3"] = df["code_4"].str[:3]
df["code_2"] = df["code_4"].str[:2]
df["code_1"] = df["code_4"].str[0]
# aggregated totals per level (by year) — sums are ints; codes remain strings
for level in ["code_1", "code_2", "code_3"]:
df[f"value_{level[-1]}"] = (
df.groupby(["year", level])["value"].transform("sum").astype(int)
)
# weights (shares) within each higher level
df["weight_in_code_3"] = (df["value"] / df["value_3"]).round(6)
df["weight_in_code_2"] = (df["value"] / df["value_2"]).round(6)
df["weight_in_code_1"] = (df["value"] / df["value_1"]).round(6)
# final columns for export
df = df[
[
"code_4",
"occupation",
"value",
"weight_in_code_3",
"weight_in_code_2",
"weight_in_code_1",
"year",
]
]
return df
def write_weights(df: pd.DataFrame, tax_id: str, year: str) -> Path:
root = get_project_root()
out_path = root / "data" / "scb_weights" / f"{tax_id}_weights_en_{year}.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)
return out_path
# ===================== #
# -------- CLI -------- #
# ===================== #
def main(tax_id: str = TAX_ID):
scb, occ_key, occupations, latest_year = fetch_scb_metadata_and_year(tax_id)
scb_fetch, occ_dict = fetch_data_for_year(scb, occ_key, occupations, latest_year)
df = build_weights_df(scb_fetch, occ_dict)
out_path = write_weights(df, tax_id, latest_year)
print(f"✅ Wrote: {out_path.resolve()}")
return df
if __name__ == "__main__":
main()