Spaces:

joseph-data
/

daioe

Sleeping

File size: 3,809 Bytes

eec71cb
c2e1a4b
eec71cb
 
 
 
 
 
 
 
e6d6708
 
eec71cb
 
c2e1a4b
e6d6708
c2e1a4b
 
 
 
 
 
e6d6708
c2e1a4b
 
 
 
 
 
e6d6708
c2e1a4b
 
 
 
 
 
e6d6708
 
c2e1a4b
 
 
 
 
 
e6d6708
 
 
 
 
 
c2e1a4b
 
 
 
 
 
eec71cb
c2e1a4b
 
e6d6708
 
c2e1a4b
 
 
 
 
eec71cb
c2e1a4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6d6708
 
 
c2e1a4b

"""SCB fetch helper: pull employment counts for SSYK taxonomies.

Main entry point:
- `fetch_taxonomy_dataframe(taxonomy)` returns `(df, year_used)`.

Output schema (tidy):
- `taxonomy`, `year`, `level`, `code`, `value`

CLI:
- `python -m src.scb_fetch --taxonomy ssyk2012`
"""

from __future__ import annotations

import argparse
from typing import Literal, Tuple

import pandas as pd
from pyscbwrapper import SCB

# Closed set of taxonomy identifiers accepted by this module; these are also
# the keys of TABLES below.
Taxonomy = Literal["ssyk2012", "ssyk96"]

# SCB table metadata keyed by taxonomy.
# Each tuple is unpacked positionally into `SCB(...)` by
# `fetch_taxonomy_dataframe`.
# NOTE(review): presumably a language code followed by the path segments that
# identify the SCB table -- confirm against the pyscbwrapper documentation.
TABLES = {
    "ssyk2012": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"),
    "ssyk96": ("en", "AM", "AM0208", "AM0208E", "YREG33"),
}


def _coerce_year(value: str | int | None) -> int | None:
    """Convert *value* to an ``int`` year, or return ``None`` if that fails.

    ``None`` is passed through unchanged. Any input that ``int()`` rejects
    with ``TypeError`` or ``ValueError`` also yields ``None`` rather than
    raising.
    """
    if value is None:
        return None
    try:
        year = int(value)
    except (TypeError, ValueError):
        return None
    return year


def _latest_year(var_block: dict) -> str:
    """Return the largest year listed under ``var_block["year"]`` as a string.

    Each entry is passed through ``_coerce_year``; entries that do not
    convert to an integer are ignored. A missing ``"year"`` key is treated
    the same as an empty list.

    Raises:
        ValueError: If no entry converts to a valid integer year.
    """
    candidates = []
    for raw_year in var_block.get("year", []):
        parsed = _coerce_year(raw_year)
        if parsed is not None:
            candidates.append(parsed)

    if candidates:
        return str(max(candidates))
    raise ValueError("SCB variable metadata did not provide any valid years")


def fetch_taxonomy_dataframe(taxonomy: Taxonomy) -> Tuple[pd.DataFrame, str]:
    """
    Pull SCB employment counts for a taxonomy and return a tidy DataFrame.

    The most recent year listed in the table's variable metadata is queried
    for every occupation code. Each returned code is zero-padded to four
    characters and its value is summed at the 4-, 3-, 2- and 1-character
    prefix levels, which are then stacked into one frame.

    Args:
        taxonomy: Key into ``TABLES`` (``"ssyk2012"`` or ``"ssyk96"``).

    Returns:
        A tuple ``(dataframe, year_used)``. The dataframe has the columns
        ``taxonomy``, ``year``, ``level``, ``code`` and ``value`` and is
        sorted by year, level and code. ``year_used`` is the queried year as
        a string. No disk writes occur.

    Raises:
        KeyError: If ``taxonomy`` is not a key of ``TABLES``.
        ValueError: If the variable metadata exposes no usable year.
        RuntimeError: If no usable rows remain after filtering.
    """
    if taxonomy not in TABLES:
        raise KeyError(f"Unknown taxonomy '{taxonomy}'")

    scb = SCB(*TABLES[taxonomy])
    var_block = scb.get_variables()
    # The first variable in the metadata is taken to be the occupation
    # dimension; its values are requested in full.
    occupations_key, occupations = next(iter(var_block.items()))
    # SCB variable names sometimes contain spaces; query keys cannot.
    clean_key = occupations_key.replace(" ", "")

    year = _latest_year(var_block)
    # Request all occupations for the freshest year exposed by the API.
    scb.set_query(**{clean_key: occupations, "year": [year]})
    scb_fetch = scb.get_data()["data"]

    records = []
    for record in scb_fetch:
        # SCB records encode the occupation code + year in the `key` field.
        code, obs_year = record["key"][:2]
        if code == "0002":
            continue  # drop unspecified bucket
        try:
            value = int(record["values"][0])
        except (TypeError, ValueError):
            # NOTE(review): SCB is understood to report unavailable cells
            # with a non-numeric placeholder (e.g. "..") -- confirm against
            # the API docs. Skip such cells instead of aborting the fetch.
            continue
        # Pad once; every aggregation level is a prefix of the padded code.
        padded = str(code).zfill(4)
        records.append(
            {
                "code_4": padded,
                "code_3": padded[:3],
                "code_2": padded[:2],
                "code_1": padded[:1],
                "year": obs_year,
                "value": value,
            }
        )

    df = pd.DataFrame(records)
    if df.empty:
        raise RuntimeError(f"SCB returned no data for taxonomy '{taxonomy}'")

    # Aggregate the counts once per prefix length, most detailed level first.
    frames = []
    for level in (4, 3, 2, 1):
        column = f"code_{level}"
        level_df = (
            df.groupby(["year", column], as_index=False)["value"]
            .sum()
            .rename(columns={column: "code"})
        )
        level_df["level"] = level
        frames.append(level_df)

    stacked = (
        pd.concat(frames, ignore_index=True)
        .assign(taxonomy=taxonomy)[["taxonomy", "year", "level", "code", "value"]]
        .sort_values(["year", "level", "code"], ignore_index=True)
    )

    return stacked, year


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for the fetch CLI.

    Recognises a single option, ``--taxonomy``, restricted to ``ssyk2012``
    or ``ssyk96`` and defaulting to ``ssyk2012``.

    Returns:
        The parsed namespace; the selection is available as ``.taxonomy``.
    """
    cli = argparse.ArgumentParser(description="Pull SCB weights for a taxonomy")
    taxonomy_option = {
        "default": "ssyk2012",
        "choices": ["ssyk2012", "ssyk96"],
        "help": "Taxonomy to download (default: ssyk2012)",
    }
    cli.add_argument("--taxonomy", **taxonomy_option)
    namespace = cli.parse_args()
    return namespace


def main() -> None:
    """CLI entry point: fetch the selected taxonomy and print a short summary.

    Prints the number of rows fetched together with the year used, followed
    by the first rows of the resulting DataFrame.
    """
    taxonomy = parse_args().taxonomy
    frame, year_used = fetch_taxonomy_dataframe(taxonomy)
    print(f"Fetched {len(frame)} rows for {taxonomy} (year {year_used})")
    print(frame.head())


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()