bdv / src /data /preprocess.py
stephmnt's picture
Sync from GitHub Actions
46f9144 verified
from __future__ import annotations
import argparse
import json
import logging
from pathlib import Path
from typing import Any, Dict, Iterable, Mapping
import pandas as pd
from src import data_prep
LOGGER = logging.getLogger(__name__)
DEFAULT_META_CONFIG: Dict[str, Dict[str, Any]] = {
"14_EU.csv": {
"type_scrutin": "europeennes",
"date_scrutin": "2014-05-25",
"tour_column": "N° tour",
"code_bv_cols": ["Code de la commune", "N° de bureau de vote"],
"rename_map": {
"Inscrits": "inscrits",
"Votants": "votants",
"Exprimés": "exprimes",
"Exprimés": "exprimes",
"Nombre de voix du candidat": "voix",
"Voix": "voix",
"Nom du candidat": "nom_candidature",
"Prénom du candidat": "nom_candidature",
"Code nuance du candidat": "code_candidature",
},
},
"14_MN14_T1T2.csv": {
"type_scrutin": "municipales",
"date_scrutin": "2014-03-23",
"tour_column": "N° tour",
"code_bv_cols": ["Code commune", "N° de bureau de vote"],
"rename_map": {
"Inscrits": "inscrits",
"Votants": "votants",
"Exprimés": "exprimes",
"Nombre de voix": "voix",
"Nom du candidat tête de liste": "nom_candidature",
"Prénom du candidat tête de liste": "nom_candidature",
"Code nuance de la liste": "code_candidature",
},
},
"17_L_T1.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2017-06-11",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Nom": "nom_candidature",
},
},
"17_L_T2.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2017-06-18",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Nom": "nom_candidature",
},
},
"17_PR_T1.csv": {
"type_scrutin": "presidentielles",
"date_scrutin": "2017-04-23",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Code nuance du candidat": "code_candidature",
},
},
"17_PR_T2.csv": {
"type_scrutin": "presidentielles",
"date_scrutin": "2017-05-07",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Code nuance du candidat": "code_candidature",
},
},
"19_EU.csv": {
"type_scrutin": "europeennes",
"date_scrutin": "2019-05-26",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom Tête de Liste": "nom_candidature",
"Nuance Liste": "code_candidature",
},
},
"20_MN_T1.csv": {
"type_scrutin": "municipales",
"date_scrutin": "2020-03-15",
"tour": 1,
"sep": ";",
"code_bv_cols": ["Code de la commune", "Code B.Vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Liste": "nom_candidature",
"Code Nuance": "code_candidature",
},
},
"20_MN_T2.csv": {
"type_scrutin": "municipales",
"date_scrutin": "2020-06-28",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code B.Vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Liste": "nom_candidature",
"Code Nuance": "code_candidature",
},
},
"21_DEP_T1.csv": {
"type_scrutin": "departementales",
"date_scrutin": "2021-06-20",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Binôme": "nom_candidature",
},
},
"21_DEP_T2.csv": {
"type_scrutin": "departementales",
"date_scrutin": "2021-06-27",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Binôme": "nom_candidature",
},
},
"21_REG_T1.csv": {
"type_scrutin": "regionales",
"date_scrutin": "2021-06-20",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance Liste": "code_candidature",
"Libellé Abrégé Liste": "nom_candidature",
},
},
"21_REG_T2.csv": {
"type_scrutin": "regionales",
"date_scrutin": "2021-06-27",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance Liste": "code_candidature",
"Libellé Abrégé Liste": "nom_candidature",
},
},
"22_L_T1.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2022-06-12",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Nom": "nom_candidature",
},
},
"22_L_T2.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2022-06-19",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance": "code_candidature",
"Nom": "nom_candidature",
},
},
"22_PR_T1.csv": {
"type_scrutin": "presidentielles",
"date_scrutin": "2022-04-10",
"tour": 1,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Code nuance du candidat": "code_candidature",
},
},
"22_PR_T2.csv": {
"type_scrutin": "presidentielles",
"date_scrutin": "2022-04-24",
"tour": 2,
"code_bv_cols": ["Code de la commune", "Code du b.vote"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nom": "nom_candidature",
"Code nuance du candidat": "code_candidature",
},
},
"24_EU.csv": {
"type_scrutin": "europeennes",
"date_scrutin": "2024-06-09",
"tour": 1,
"code_bv_cols": ["Code commune", "Code BV"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix 1": "voix",
"Voix": "voix",
"Nuance liste 1": "code_candidature",
"Libellé abrégé de liste 1": "nom_candidature",
},
},
"24_L_T1.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2024-06-30",
"tour": 1,
"code_bv_cols": ["Code commune", "Code BV"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance Liste": "code_candidature",
"Libellé Abrégé Liste": "nom_candidature",
"Binôme": "nom_candidature",
},
},
"24_L_T2.csv": {
"type_scrutin": "legislatives",
"date_scrutin": "2024-07-07",
"tour": 2,
"code_bv_cols": ["Code commune", "Code BV"],
"rename_map": {
"Inscrits": "inscrits",
"Abstentions": "abstentions",
"Votants": "votants",
"Blancs": "blancs",
"Nuls": "nuls",
"Exprimés": "exprimes",
"Voix": "voix",
"Nuance Liste": "code_candidature",
"Libellé Abrégé Liste": "nom_candidature",
"Binôme": "nom_candidature",
},
},
}
DEFAULT_META_CONFIG_PATH = Path(__file__).resolve().parents[2] / "config" / "raw_sources.yaml"
def _resolve_meta_config(raw: Mapping[str, Mapping[str, Any]]) -> Dict[str, Dict[str, Any]]:
resolved: Dict[str, Dict[str, Any]] = {}
def resolve_one(key: str, stack: list[str]) -> Dict[str, Any]:
if key in resolved:
return resolved[key]
if key in stack:
raise ValueError(f"Cycle detecte dans meta-config: {' -> '.join(stack + [key])}")
meta = dict(raw[key])
base_key = meta.pop("copy_from", None)
if base_key:
if base_key not in raw:
raise KeyError(f"copy_from cible introuvable: {base_key}")
base = resolve_one(base_key, stack + [key])
merged = dict(base)
rename_base = dict(base.get("rename_map", {}))
rename_override = dict(meta.get("rename_map", {}))
merged.update(meta)
if rename_base or rename_override:
merged["rename_map"] = {**rename_base, **rename_override}
resolved[key] = merged
else:
resolved[key] = meta
return resolved[key]
for name in raw:
resolve_one(name, [])
return resolved
def load_meta_config(meta_path: Path | None) -> Dict[str, Dict[str, Any]]:
if meta_path is None:
if DEFAULT_META_CONFIG_PATH.exists():
meta_path = DEFAULT_META_CONFIG_PATH
else:
return DEFAULT_META_CONFIG
if not meta_path.exists():
raise FileNotFoundError(f"Meta-config file not found: {meta_path}")
if meta_path.suffix in {".yml", ".yaml"}:
try:
import yaml
except Exception as exc:
raise RuntimeError("PyYAML is required to read YAML meta-config files.") from exc
raw = yaml.safe_load(meta_path.read_text()) or {}
else:
raw = json.loads(meta_path.read_text())
if not isinstance(raw, dict):
raise ValueError("Meta-config invalide: attendu un mapping de fichiers vers meta-donnees.")
return _resolve_meta_config(raw)
def preprocess_all(raw_dir: Path, output_dir: Path, meta_config: Mapping[str, Mapping[str, Any]]) -> pd.DataFrame:
frames = []
missing: list[str] = []
for file_name, meta in meta_config.items():
path = raw_dir / file_name
if not path.exists():
missing.append(file_name)
continue
LOGGER.info("Standardisation de %s", file_name)
df_std = data_prep.standardize_election(
path,
meta,
rename_map=meta.get("rename_map", {}),
sep=meta.get("sep", ";"),
encoding=meta.get("encoding", ("cp1252", "utf-8-sig", "latin-1")),
decimal=meta.get("decimal", ","),
) # type: ignore[arg-type]
frames.append(df_std)
if missing:
LOGGER.warning("Fichiers manquants ignorés: %s", ", ".join(sorted(missing)))
if not frames:
raise RuntimeError("Aucune donnée chargée : vérifier le dossier raw et la configuration meta.")
elections_long = pd.concat(frames, ignore_index=True)
elections_long["date_scrutin"] = pd.to_datetime(elections_long["date_scrutin"])
elections_long["annee"] = elections_long["date_scrutin"].dt.year
elections_long["type_scrutin"] = elections_long["type_scrutin"].str.lower()
elections_long["code_commune"] = elections_long["code_bv"].astype(str).str.split("-").str[0]
issues = data_prep.validate_consistency(elections_long)
for name, df_issue in issues.items():
if len(df_issue) > 0:
LOGGER.warning("%s : %s lignes a inspecter", name, len(df_issue))
output_dir.mkdir(parents=True, exist_ok=True)
parquet_path = output_dir / "elections_long.parquet"
csv_path = output_dir / "elections_long.csv"
elections_long.to_parquet(parquet_path, index=False)
elections_long.to_csv(csv_path, sep=";", index=False)
LOGGER.info("Long format sauvegarde (%s lignes) -> %s / %s", len(elections_long), parquet_path, csv_path)
return elections_long
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Prétraitement des fichiers bruts en format long standardisé.")
parser.add_argument("--raw-dir", type=Path, default=Path("data/raw"), help="Répertoire des fichiers bruts CSV.")
parser.add_argument("--output-dir", type=Path, default=Path("data/interim"), help="Destination du format long harmonisé.")
parser.add_argument(
"--meta-config",
type=Path,
default=None,
help="Chemin vers un fichier JSON/YAML décrivant les meta-données des scrutins. Par défaut, utilise la configuration embarquée.",
)
return parser.parse_args()
def main() -> None:
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
args = parse_args()
meta_config = load_meta_config(args.meta_config)
preprocess_all(args.raw_dir, args.output_dir, meta_config)
if __name__ == "__main__":
main()