Spaces:
Sleeping
Sleeping
| """SCB fetch helper: pull employment counts for SSYK taxonomies. | |
| Main entry point: | |
| - `fetch_taxonomy_dataframe(taxonomy)` returns `(df, year_used)`. | |
| Output schema (tidy): | |
| - `taxonomy`, `year`, `level`, `code`, `value` | |
| CLI: | |
| - `python -m src.scb_fetch --taxonomy ssyk2012` | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| from typing import Literal, Tuple | |
| import pandas as pd | |
| from pyscbwrapper import SCB | |
| Taxonomy = Literal["ssyk2012", "ssyk96"] | |
| # SCB table metadata keyed by taxonomy. | |
| TABLES = { | |
| "ssyk2012": ("en", "AM", "AM0208", "AM0208E", "YREG51BAS"), | |
| "ssyk96": ("en", "AM", "AM0208", "AM0208E", "YREG33"), | |
| } | |
| def _coerce_year(value: str | int | None) -> int | None: | |
| try: | |
| return int(value) if value is not None else None | |
| except (TypeError, ValueError): | |
| return None | |
| def _latest_year(var_block: dict) -> str: | |
| years = [_coerce_year(year) for year in var_block.get("year", [])] | |
| valid = [year for year in years if year is not None] | |
| if not valid: | |
| raise ValueError("SCB variable metadata did not provide any valid years") | |
| return str(max(valid)) | |
| def fetch_taxonomy_dataframe(taxonomy: Taxonomy) -> Tuple[pd.DataFrame, str]: | |
| """ | |
| Pull SCB employment counts for a taxonomy and return a tidy DataFrame. | |
| Returns a tuple of (dataframe, year_used). No disk writes occur. | |
| """ | |
| if taxonomy not in TABLES: | |
| raise KeyError(f"Unknown taxonomy '{taxonomy}'") | |
| scb = SCB(*TABLES[taxonomy]) | |
| var_block = scb.get_variables() | |
| occupations_key, occupations = next(iter(var_block.items())) | |
| # SCB variable names sometimes contain spaces; query keys cannot. | |
| clean_key = occupations_key.replace(" ", "") | |
| year = _latest_year(var_block) | |
| # Request all occupations for the freshest year exposed by the API. | |
| scb.set_query(**{clean_key: occupations, "year": [year]}) | |
| scb_fetch = scb.get_data()["data"] | |
| records = [] | |
| for record in scb_fetch: | |
| # SCB records encode the occupation code + year in the `key` field. | |
| code, obs_year = record["key"][:2] | |
| if code == "0002": | |
| continue # drop unspecified bucket | |
| value = int(record["values"][0]) | |
| records.append( | |
| { | |
| "code_4": str(code).zfill(4), | |
| "code_3": str(code).zfill(4)[:3], | |
| "code_2": str(code).zfill(4)[:2], | |
| "code_1": str(code).zfill(4)[:1], | |
| "year": obs_year, | |
| "value": value, | |
| } | |
| ) | |
| df = pd.DataFrame(records) | |
| if df.empty: | |
| raise RuntimeError(f"SCB returned no data for taxonomy '{taxonomy}'") | |
| level_map = {4: "code_4", 3: "code_3", 2: "code_2", 1: "code_1"} | |
| frames = [] | |
| for level, column in level_map.items(): | |
| level_df = ( | |
| df.groupby(["year", column], as_index=False)["value"] | |
| .sum() | |
| .rename(columns={column: "code"}) | |
| ) | |
| level_df["level"] = level | |
| frames.append(level_df) | |
| stacked = ( | |
| pd.concat(frames, ignore_index=True) | |
| .assign(taxonomy=taxonomy)[["taxonomy", "year", "level", "code", "value"]] | |
| .sort_values(["year", "level", "code"], ignore_index=True) | |
| ) | |
| return stacked, year | |
| def parse_args() -> argparse.Namespace: | |
| parser = argparse.ArgumentParser(description="Pull SCB weights for a taxonomy") | |
| parser.add_argument( | |
| "--taxonomy", | |
| default="ssyk2012", | |
| choices=["ssyk2012", "ssyk96"], | |
| help="Taxonomy to download (default: ssyk2012)", | |
| ) | |
| return parser.parse_args() | |
| def main() -> None: | |
| args = parse_args() | |
| df, year = fetch_taxonomy_dataframe(args.taxonomy) | |
| print(f"Fetched {len(df)} rows for {args.taxonomy} (year {year})") | |
| print(df.head()) | |
| if __name__ == "__main__": | |
| main() | |