from __future__ import annotations from pathlib import Path from typing import Dict, List, Tuple import pandas as pd from pyscbwrapper import SCB # ===================== # # ---- CONFIG AREA ---- # # ===================== # TAX_ID = "ssyk2012" # or "ssyk96" TABLES: Dict[str, Tuple[str, ...]] = { "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"), "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"), } def get_project_root() -> Path: try: return Path(__file__).resolve().parents[1] except NameError: return Path.cwd().resolve().parents[1] # ========================= # # ---- CORE FUNCTIONS ---- # # ========================= # def latest_year_from_vars(vars_: Dict[str, List[str]]) -> str: years = [] for y in vars_.get("year", []): try: years.append(int(y)) except Exception: pass return str(max(years)) def fetch_scb_metadata_and_year(tax_id: str): scb = SCB(*TABLES[f"{tax_id}_tab"]) vars_ = scb.get_variables() occupations_key, occupations = next(iter(vars_.items())) clean_key = occupations_key.replace(" ", "") # wrapper quirk latest_year = latest_year_from_vars(vars_) return scb, clean_key, occupations, latest_year def fetch_data_for_year( scb: SCB, occ_key: str, occupations: List[str], year: str ) -> Tuple[List[dict], Dict[str, str]]: scb.set_query(**{occ_key: occupations, "year": [year]}) scb_data = scb.get_data() scb_fetch = scb_data["data"] codes = scb.get_query()["query"][0]["selection"]["values"] occ_dict = dict(zip(codes, occupations)) return scb_fetch, occ_dict def build_weights_df(scb_fetch: List[dict], occ_dict: Dict[str, str]) -> pd.DataFrame: # raw records records = [] for r in scb_fetch: code, year = r["key"][:2] # occupation code, year name = occ_dict.get(code, code) value = r["values"][0] records.append( {"code_4": code, "occupation": name, "year": year, "value": value} ) df = pd.DataFrame(records) df = df[df["code_4"] != "0002"].reset_index(drop=True) ## remove unidentified group # keep year as string, codes as strings; value numeric df["year"] = df["year"].astype(str) df["value"] = pd.to_numeric(df["value"], errors="coerce") df = df.dropna(subset=["value"]).reset_index(drop=True) df["value"] = df["value"].astype(int) df = df[df["value"] > 0].reset_index(drop=True) # code hierarchy df["code_4"] = ( df["code_4"] .astype(str) .str.replace(r"\D", "", regex=True) .str.slice(0, 4) .str.zfill(4) ) df["code_3"] = df["code_4"].str[:3] df["code_2"] = df["code_4"].str[:2] df["code_1"] = df["code_4"].str[0] # aggregated totals per level (by year) — sums are ints; codes remain strings for level in ["code_1", "code_2", "code_3"]: df[f"value_{level[-1]}"] = ( df.groupby(["year", level])["value"].transform("sum").astype(int) ) # weights (shares) within each higher level df["weight_in_code_3"] = (df["value"] / df["value_3"]).round(6) df["weight_in_code_2"] = (df["value"] / df["value_2"]).round(6) df["weight_in_code_1"] = (df["value"] / df["value_1"]).round(6) # final columns for export df = df[ [ "code_4", "occupation", "value", "weight_in_code_3", "weight_in_code_2", "weight_in_code_1", "year", ] ] return df def write_weights(df: pd.DataFrame, tax_id: str, year: str) -> Path: root = get_project_root() out_path = root / "data" / "scb_weights" / f"{tax_id}_weights_en_{year}.csv" out_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(out_path, index=False) return out_path # ===================== # # -------- CLI -------- # # ===================== # def main(tax_id: str = TAX_ID): scb, occ_key, occupations, latest_year = fetch_scb_metadata_and_year(tax_id) scb_fetch, occ_dict = fetch_data_for_year(scb, occ_key, occupations, latest_year) df = build_weights_df(scb_fetch, occ_dict) out_path = write_weights(df, tax_id, latest_year) print(f"✅ Wrote: {out_path.resolve()}") return df if __name__ == "__main__": main()