Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| import pandas as pd | |
| from pyscbwrapper import SCB | |
| # ===================== # | |
| # ---- CONFIG AREA ---- # | |
| # ===================== # | |
| TAX_ID = "ssyk2012" # or "ssyk96" | |
| TABLES: Dict[str, Tuple[str, ...]] = { | |
| "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"), | |
| "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"), | |
| } | |
| def get_project_root() -> Path: | |
| try: | |
| return Path(__file__).resolve().parents[1] | |
| except NameError: | |
| return Path.cwd().resolve().parents[1] | |
| # ========================= # | |
| # ---- CORE FUNCTIONS ---- # | |
| # ========================= # | |
| def latest_year_from_vars(vars_: Dict[str, List[str]]) -> str: | |
| years = [] | |
| for y in vars_.get("year", []): | |
| try: | |
| years.append(int(y)) | |
| except Exception: | |
| pass | |
| return str(max(years)) | |
| def fetch_scb_metadata_and_year(tax_id: str): | |
| scb = SCB(*TABLES[f"{tax_id}_tab"]) | |
| vars_ = scb.get_variables() | |
| occupations_key, occupations = next(iter(vars_.items())) | |
| clean_key = occupations_key.replace(" ", "") # wrapper quirk | |
| latest_year = latest_year_from_vars(vars_) | |
| return scb, clean_key, occupations, latest_year | |
| def fetch_data_for_year( | |
| scb: SCB, occ_key: str, occupations: List[str], year: str | |
| ) -> Tuple[List[dict], Dict[str, str]]: | |
| scb.set_query(**{occ_key: occupations, "year": [year]}) | |
| scb_data = scb.get_data() | |
| scb_fetch = scb_data["data"] | |
| codes = scb.get_query()["query"][0]["selection"]["values"] | |
| occ_dict = dict(zip(codes, occupations)) | |
| return scb_fetch, occ_dict | |
| def build_weights_df(scb_fetch: List[dict], occ_dict: Dict[str, str]) -> pd.DataFrame: | |
| # raw records | |
| records = [] | |
| for r in scb_fetch: | |
| code, year = r["key"][:2] # occupation code, year | |
| name = occ_dict.get(code, code) | |
| value = r["values"][0] | |
| records.append( | |
| {"code_4": code, "occupation": name, "year": year, "value": value} | |
| ) | |
| df = pd.DataFrame(records) | |
| df = df[df["code_4"] != "0002"].reset_index(drop=True) ## remove unidentified group | |
| # keep year as string, codes as strings; value numeric | |
| df["year"] = df["year"].astype(str) | |
| df["value"] = pd.to_numeric(df["value"], errors="coerce") | |
| df = df.dropna(subset=["value"]).reset_index(drop=True) | |
| df["value"] = df["value"].astype(int) | |
| df = df[df["value"] > 0].reset_index(drop=True) | |
| # code hierarchy | |
| df["code_4"] = ( | |
| df["code_4"] | |
| .astype(str) | |
| .str.replace(r"\D", "", regex=True) | |
| .str.slice(0, 4) | |
| .str.zfill(4) | |
| ) | |
| df["code_3"] = df["code_4"].str[:3] | |
| df["code_2"] = df["code_4"].str[:2] | |
| df["code_1"] = df["code_4"].str[0] | |
| # aggregated totals per level (by year) — sums are ints; codes remain strings | |
| for level in ["code_1", "code_2", "code_3"]: | |
| df[f"value_{level[-1]}"] = ( | |
| df.groupby(["year", level])["value"].transform("sum").astype(int) | |
| ) | |
| # weights (shares) within each higher level | |
| df["weight_in_code_3"] = (df["value"] / df["value_3"]).round(6) | |
| df["weight_in_code_2"] = (df["value"] / df["value_2"]).round(6) | |
| df["weight_in_code_1"] = (df["value"] / df["value_1"]).round(6) | |
| # final columns for export | |
| df = df[ | |
| [ | |
| "code_4", | |
| "occupation", | |
| "value", | |
| "weight_in_code_3", | |
| "weight_in_code_2", | |
| "weight_in_code_1", | |
| "year", | |
| ] | |
| ] | |
| return df | |
| def write_weights(df: pd.DataFrame, tax_id: str, year: str) -> Path: | |
| root = get_project_root() | |
| out_path = root / "data" / "scb_weights" / f"{tax_id}_weights_en_{year}.csv" | |
| out_path.parent.mkdir(parents=True, exist_ok=True) | |
| df.to_csv(out_path, index=False) | |
| return out_path | |
| # ===================== # | |
| # -------- CLI -------- # | |
| # ===================== # | |
| def main(tax_id: str = TAX_ID): | |
| scb, occ_key, occupations, latest_year = fetch_scb_metadata_and_year(tax_id) | |
| scb_fetch, occ_dict = fetch_data_for_year(scb, occ_key, occupations, latest_year) | |
| df = build_weights_df(scb_fetch, occ_dict) | |
| out_path = write_weights(df, tax_id, latest_year) | |
| print(f"✅ Wrote: {out_path.resolve()}") | |
| return df | |
| if __name__ == "__main__": | |
| main() | |