## Required Libraries import pandas as pd from pyscbwrapper import SCB from pathlib import Path ## Sorting the working directory # ROOT = Path(__file__).resolve().parents[1] ## for scripts ROOT = Path.cwd().resolve().parents[1] ## for notebooks ## Define the Table Nodes - Can be changed TAX_ID = "ssyk2012" # or ssyk96 Classification TABLES = { "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"), "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"), } scb = SCB(*TABLES[f"{TAX_ID}_tab"]) # scb.info() var_ = scb.get_variables() occupations_key, occupations = next(iter(var_.items())) clean_key = occupations_key.replace(" ", "") # Years can be strings; coerce to int safely and pick max def coerce_year(y): try: return int(y) except Exception: return None years = [coerce_year(y) for y in var_["year"]] years = [y for y in years if y is not None] latest_year = str(max(years)) # ========= 3) Build query (match exact variable names) ========= # scb.set_query( **{ clean_key: occupations, "year": [latest_year], } ) scb_data = scb.get_data() scb_fetch = scb_data["data"] codes = scb.get_query()["query"][0]["selection"]["values"] occ_dict = dict(zip(codes, occupations)) # CREATING CLEANED DATAFRAME records = [] for r in scb_fetch: code, year = r["key"][:2] # occupation code, year name = occ_dict.get(code, code) value = r["values"][0] # raw string records.append({"code_4": code, "occupation": name, "year": year, "value": value}) df = pd.DataFrame(records) df = df[df["code_4"] != "0002"].reset_index(drop=True) ## remove unidentified group # Drop missing values before weighting df = df.dropna(subset=["value"]).reset_index(drop=True) # Code formatting df["code_4"] = df["code_4"].astype(str).str.zfill(4) df["code_3"] = df["code_4"].str[:3] df["code_2"] = df["code_4"].str[:2] df["code_1"] = df["code_4"].str[0] df["value"] = df["value"].astype(int) # Create aggregated totals automatically for level in ["code_1", "code_2", "code_3"]: df[f"value_{level[-1]}"] = ( df.groupby(["year", level])["value"].transform("sum").astype(int) ) # weighting per the next broader group df["weight_in_code_3"] = (df["value"] / df["value_3"]).round(6) df["weight_in_code_2"] = (df["value_3"] / df["value_2"]).round(6) df["weight_in_code_1"] = (df["value_2"] / df["value_1"]).round(6) ## REMOVE EXTRA COLUMNS AND PREPARE DATA FOR NEXT STAGE df = df[ [ "code_4", "occupation", "value", "weight_in_code_3", "weight_in_code_2", "weight_in_code_1", ] ] # sanity check out_path = ROOT / "data" / "scb_weights" / f"{TAX_ID}_weights_en_{latest_year}.csv" out_path.parent.mkdir(parents=True, exist_ok=True) df.to_csv(out_path, index=False) print(f"Wrote: {out_path.resolve()}")