File size: 4,347 Bytes
1264273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from __future__ import annotations

from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
from pyscbwrapper import SCB

# ===================== #
# ---- CONFIG AREA ---- #
# ===================== #

TAX_ID = "ssyk2012"  # or "ssyk96"
TABLES: Dict[str, Tuple[str, ...]] = {
    "ssyk2012_tab": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
    "ssyk96_tab": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
}


def get_project_root() -> Path:
    try:
        return Path(__file__).resolve().parents[1]
    except NameError:
        return Path.cwd().resolve().parents[1]


# ========================= #
# ---- CORE FUNCTIONS  ---- #
# ========================= #


def latest_year_from_vars(vars_: Dict[str, List[str]]) -> str:
    years = []
    for y in vars_.get("year", []):
        try:
            years.append(int(y))
        except Exception:
            pass
    return str(max(years))


def fetch_scb_metadata_and_year(tax_id: str):
    scb = SCB(*TABLES[f"{tax_id}_tab"])
    vars_ = scb.get_variables()
    occupations_key, occupations = next(iter(vars_.items()))
    clean_key = occupations_key.replace(" ", "")  # wrapper quirk
    latest_year = latest_year_from_vars(vars_)
    return scb, clean_key, occupations, latest_year


def fetch_data_for_year(
    scb: SCB, occ_key: str, occupations: List[str], year: str
) -> Tuple[List[dict], Dict[str, str]]:
    scb.set_query(**{occ_key: occupations, "year": [year]})
    scb_data = scb.get_data()
    scb_fetch = scb_data["data"]
    codes = scb.get_query()["query"][0]["selection"]["values"]
    occ_dict = dict(zip(codes, occupations))
    return scb_fetch, occ_dict


def build_weights_df(scb_fetch: List[dict], occ_dict: Dict[str, str]) -> pd.DataFrame:
    # raw records
    records = []
    for r in scb_fetch:
        code, year = r["key"][:2]  # occupation code, year
        name = occ_dict.get(code, code)
        value = r["values"][0]
        records.append(
            {"code_4": code, "occupation": name, "year": year, "value": value}
        )

    df = pd.DataFrame(records)
    df = df[df["code_4"] != "0002"].reset_index(drop=True)  ## remove unidentified group

    # keep year as string, codes as strings; value numeric
    df["year"] = df["year"].astype(str)
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["value"]).reset_index(drop=True)
    df["value"] = df["value"].astype(int)
    df = df[df["value"] > 0].reset_index(drop=True)

    # code hierarchy
    df["code_4"] = (
        df["code_4"]
        .astype(str)
        .str.replace(r"\D", "", regex=True)
        .str.slice(0, 4)
        .str.zfill(4)
    )
    df["code_3"] = df["code_4"].str[:3]
    df["code_2"] = df["code_4"].str[:2]
    df["code_1"] = df["code_4"].str[0]

    # aggregated totals per level (by year) — sums are ints; codes remain strings
    for level in ["code_1", "code_2", "code_3"]:
        df[f"value_{level[-1]}"] = (
            df.groupby(["year", level])["value"].transform("sum").astype(int)
        )

    # weights (shares) within each higher level
    df["weight_in_code_3"] = (df["value"] / df["value_3"]).round(6)
    df["weight_in_code_2"] = (df["value"] / df["value_2"]).round(6)
    df["weight_in_code_1"] = (df["value"] / df["value_1"]).round(6)

    # final columns for export
    df = df[
        [
            "code_4",
            "occupation",
            "value",
            "weight_in_code_3",
            "weight_in_code_2",
            "weight_in_code_1",
            "year",
        ]
    ]

    return df


def write_weights(df: pd.DataFrame, tax_id: str, year: str) -> Path:
    root = get_project_root()
    out_path = root / "data" / "scb_weights" / f"{tax_id}_weights_en_{year}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return out_path


# ===================== #
# -------- CLI -------- #
# ===================== #


def main(tax_id: str = TAX_ID):
    scb, occ_key, occupations, latest_year = fetch_scb_metadata_and_year(tax_id)
    scb_fetch, occ_dict = fetch_data_for_year(scb, occ_key, occupations, latest_year)
    df = build_weights_df(scb_fetch, occ_dict)
    out_path = write_weights(df, tax_id, latest_year)
    print(f"✅ Wrote: {out_path.resolve()}")
    return df


if __name__ == "__main__":
    main()