|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import logging |
|
|
from pathlib import Path |
|
|
from typing import Any, Dict, Iterable, Mapping |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
from src import data_prep |
|
|
|
|
|
# Module-level logger, named after this module so that handlers and levels
# can be configured per module by the application.
LOGGER = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
# Embedded fallback meta-configuration, used when no external config file is
# supplied or found.
#
# Each top-level key is the name of a raw CSV file looked up in the raw
# directory; its value is the metadata handed to
# ``data_prep.standardize_election`` for that file:
#
# * ``type_scrutin`` / ``date_scrutin``: election type and ISO date.
# * ``tour``: fixed round number, or ``tour_column``: name of the source
#   column holding the round (the 2014 files use the latter).
# * ``code_bv_cols``: source columns that identify a polling station
#   (presumably combined into ``code_bv`` by ``data_prep`` -- confirm there).
# * ``rename_map``: source column name -> standardized column name.
# * ``sep`` (optional): CSV separator override.
#
# NOTE(review): several rename maps send two source columns to the same
# target (e.g. "Nom ..." and "Prénom ..." -> ``nom_candidature``); check how
# ``data_prep`` handles a file that contains both.
DEFAULT_META_CONFIG: Dict[str, Dict[str, Any]] = {
    "14_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2014-05-25",
        "tour_column": "N° tour",
        "code_bv_cols": ["Code de la commune", "N° de bureau de vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Votants": "votants",
            "Exprimés": "exprimes",
            "Nombre de voix du candidat": "voix",
            "Voix": "voix",
            "Nom du candidat": "nom_candidature",
            "Prénom du candidat": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "14_MN14_T1T2.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2014-03-23",
        "tour_column": "N° tour",
        "code_bv_cols": ["Code commune", "N° de bureau de vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Votants": "votants",
            "Exprimés": "exprimes",
            "Nombre de voix": "voix",
            "Nom du candidat tête de liste": "nom_candidature",
            "Prénom du candidat tête de liste": "nom_candidature",
            "Code nuance de la liste": "code_candidature",
        },
    },
    "17_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2017-06-11",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Nom": "nom_candidature",
        },
    },
    "17_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2017-06-18",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Nom": "nom_candidature",
        },
    },
    "17_PR_T1.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2017-04-23",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "17_PR_T2.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2017-05-07",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "19_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2019-05-26",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom Tête de Liste": "nom_candidature",
            "Nuance Liste": "code_candidature",
        },
    },
    "20_MN_T1.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2020-03-15",
        "tour": 1,
        "sep": ";",
        "code_bv_cols": ["Code de la commune", "Code B.Vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Liste": "nom_candidature",
            "Code Nuance": "code_candidature",
        },
    },
    "20_MN_T2.csv": {
        "type_scrutin": "municipales",
        "date_scrutin": "2020-06-28",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code B.Vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Liste": "nom_candidature",
            "Code Nuance": "code_candidature",
        },
    },
    "21_DEP_T1.csv": {
        "type_scrutin": "departementales",
        "date_scrutin": "2021-06-20",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Binôme": "nom_candidature",
        },
    },
    "21_DEP_T2.csv": {
        "type_scrutin": "departementales",
        "date_scrutin": "2021-06-27",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Binôme": "nom_candidature",
        },
    },
    "21_REG_T1.csv": {
        "type_scrutin": "regionales",
        "date_scrutin": "2021-06-20",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance Liste": "code_candidature",
            "Libellé Abrégé Liste": "nom_candidature",
        },
    },
    "21_REG_T2.csv": {
        "type_scrutin": "regionales",
        "date_scrutin": "2021-06-27",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance Liste": "code_candidature",
            "Libellé Abrégé Liste": "nom_candidature",
        },
    },
    "22_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2022-06-12",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Nom": "nom_candidature",
        },
    },
    "22_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2022-06-19",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance": "code_candidature",
            "Nom": "nom_candidature",
        },
    },
    "22_PR_T1.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2022-04-10",
        "tour": 1,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "22_PR_T2.csv": {
        "type_scrutin": "presidentielles",
        "date_scrutin": "2022-04-24",
        "tour": 2,
        "code_bv_cols": ["Code de la commune", "Code du b.vote"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nom": "nom_candidature",
            "Code nuance du candidat": "code_candidature",
        },
    },
    "24_EU.csv": {
        "type_scrutin": "europeennes",
        "date_scrutin": "2024-06-09",
        "tour": 1,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix 1": "voix",
            "Voix": "voix",
            "Nuance liste 1": "code_candidature",
            "Libellé abrégé de liste 1": "nom_candidature",
        },
    },
    "24_L_T1.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2024-06-30",
        "tour": 1,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance Liste": "code_candidature",
            "Libellé Abrégé Liste": "nom_candidature",
            "Binôme": "nom_candidature",
        },
    },
    "24_L_T2.csv": {
        "type_scrutin": "legislatives",
        "date_scrutin": "2024-07-07",
        "tour": 2,
        "code_bv_cols": ["Code commune", "Code BV"],
        "rename_map": {
            "Inscrits": "inscrits",
            "Abstentions": "abstentions",
            "Votants": "votants",
            "Blancs": "blancs",
            "Nuls": "nuls",
            "Exprimés": "exprimes",
            "Voix": "voix",
            "Nuance Liste": "code_candidature",
            "Libellé Abrégé Liste": "nom_candidature",
            "Binôme": "nom_candidature",
        },
    },
}
|
|
|
|
|
# Location of the optional on-disk meta-config: ``config/raw_sources.yaml``
# under ``parents[2]`` of this file's resolved path (presumably the
# repository root -- confirm against the project layout).
DEFAULT_META_CONFIG_PATH = Path(__file__).resolve().parents[2] / "config" / "raw_sources.yaml"
|
|
|
|
|
|
|
|
def _resolve_meta_config(raw: Mapping[str, Mapping[str, Any]]) -> Dict[str, Dict[str, Any]]: |
|
|
resolved: Dict[str, Dict[str, Any]] = {} |
|
|
|
|
|
def resolve_one(key: str, stack: list[str]) -> Dict[str, Any]: |
|
|
if key in resolved: |
|
|
return resolved[key] |
|
|
if key in stack: |
|
|
raise ValueError(f"Cycle detecte dans meta-config: {' -> '.join(stack + [key])}") |
|
|
meta = dict(raw[key]) |
|
|
base_key = meta.pop("copy_from", None) |
|
|
if base_key: |
|
|
if base_key not in raw: |
|
|
raise KeyError(f"copy_from cible introuvable: {base_key}") |
|
|
base = resolve_one(base_key, stack + [key]) |
|
|
merged = dict(base) |
|
|
rename_base = dict(base.get("rename_map", {})) |
|
|
rename_override = dict(meta.get("rename_map", {})) |
|
|
merged.update(meta) |
|
|
if rename_base or rename_override: |
|
|
merged["rename_map"] = {**rename_base, **rename_override} |
|
|
resolved[key] = merged |
|
|
else: |
|
|
resolved[key] = meta |
|
|
return resolved[key] |
|
|
|
|
|
for name in raw: |
|
|
resolve_one(name, []) |
|
|
return resolved |
|
|
|
|
|
|
|
|
def load_meta_config(meta_path: Path | None) -> Dict[str, Dict[str, Any]]:
    """Load the per-file election metadata used by the preprocessing step.

    Args:
        meta_path: Path to a JSON or YAML (``.yml``/``.yaml``, any case)
            meta-config file. When ``None``, ``DEFAULT_META_CONFIG_PATH`` is
            used if it exists; otherwise the embedded ``DEFAULT_META_CONFIG``
            is returned as-is (the module-level object, not a copy).

    Returns:
        Mapping of raw file name to its metadata, with ``copy_from``
        references expanded for file-based configurations.

    Raises:
        FileNotFoundError: If ``meta_path`` does not exist.
        RuntimeError: If a YAML file is given but PyYAML is not installed.
        ValueError: If the file's top-level value is not a mapping.
    """
    if meta_path is None:
        if not DEFAULT_META_CONFIG_PATH.exists():
            return DEFAULT_META_CONFIG
        meta_path = DEFAULT_META_CONFIG_PATH

    if not meta_path.exists():
        raise FileNotFoundError(f"Meta-config file not found: {meta_path}")

    # Decode explicitly instead of relying on the platform default: the config
    # holds accented column names that a cp1252 locale default would corrupt.
    # "utf-8-sig" also accepts files saved with a BOM.
    if meta_path.suffix.lower() in {".yml", ".yaml"}:
        try:
            import yaml  # optional dependency, only needed for YAML configs
        except ImportError as exc:
            raise RuntimeError("PyYAML is required to read YAML meta-config files.") from exc
        # An empty YAML document loads as None; treat it as an empty config.
        raw = yaml.safe_load(meta_path.read_text(encoding="utf-8-sig")) or {}
    else:
        raw = json.loads(meta_path.read_text(encoding="utf-8-sig"))

    if not isinstance(raw, dict):
        raise ValueError("Meta-config invalide: attendu un mapping de fichiers vers meta-donnees.")
    return _resolve_meta_config(raw)
|
|
|
|
|
|
|
|
def preprocess_all(raw_dir: Path, output_dir: Path, meta_config: Mapping[str, Mapping[str, Any]]) -> pd.DataFrame:
    """Standardize every configured raw file and persist the combined table.

    Each entry of ``meta_config`` names a file expected under ``raw_dir``.
    Files that exist are standardized through
    ``data_prep.standardize_election``; absent ones are skipped and reported
    in a single warning. The standardized frames are concatenated, enriched
    with ``annee`` and ``code_commune``, checked with
    ``data_prep.validate_consistency`` (findings are only logged), then
    written to ``output_dir`` as ``elections_long.parquet`` and
    ``elections_long.csv`` (``;``-separated).

    Args:
        raw_dir: Directory containing the raw CSV files.
        output_dir: Destination directory, created if needed.
        meta_config: Mapping of raw file name to its metadata.

    Returns:
        The combined long-format DataFrame.

    Raises:
        RuntimeError: If none of the configured files could be loaded.
    """
    standardized = []
    absent: list[str] = []

    for source_name, source_meta in meta_config.items():
        source_path = raw_dir / source_name
        if source_path.exists():
            LOGGER.info("Standardisation de %s", source_name)
            standardized.append(
                data_prep.standardize_election(
                    source_path,
                    source_meta,
                    rename_map=source_meta.get("rename_map", {}),
                    sep=source_meta.get("sep", ";"),
                    encoding=source_meta.get("encoding", ("cp1252", "utf-8-sig", "latin-1")),
                    decimal=source_meta.get("decimal", ","),
                )
            )
        else:
            absent.append(source_name)

    if absent:
        LOGGER.warning("Fichiers manquants ignorés: %s", ", ".join(sorted(absent)))
    if not standardized:
        raise RuntimeError("Aucune donnée chargée : vérifier le dossier raw et la configuration meta.")

    combined = pd.concat(standardized, ignore_index=True)

    # Normalize types and derive helper columns.
    combined["date_scrutin"] = pd.to_datetime(combined["date_scrutin"])
    combined["annee"] = combined["date_scrutin"].dt.year
    combined["type_scrutin"] = combined["type_scrutin"].str.lower()
    # The commune code is whatever precedes the first "-" in code_bv.
    bv_parts = combined["code_bv"].astype(str).str.split("-")
    combined["code_commune"] = bv_parts.str[0]

    # Consistency findings are reported but never block the export.
    for check_name, flagged in data_prep.validate_consistency(combined).items():
        flagged_count = len(flagged)
        if flagged_count > 0:
            LOGGER.warning("%s : %s lignes a inspecter", check_name, flagged_count)

    output_dir.mkdir(parents=True, exist_ok=True)
    parquet_path = output_dir / "elections_long.parquet"
    csv_path = output_dir / "elections_long.csv"
    combined.to_parquet(parquet_path, index=False)
    combined.to_csv(csv_path, sep=";", index=False)
    LOGGER.info("Long format sauvegarde (%s lignes) -> %s / %s", len(combined), parquet_path, csv_path)
    return combined
|
|
|
|
|
|
|
|
def parse_args(argv: Iterable[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the preprocessing script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before; passing an explicit
            list allows programmatic use and testing.

    Returns:
        Namespace with ``raw_dir``, ``output_dir`` (both ``Path``) and
        ``meta_config`` (``Path`` or ``None``).
    """
    parser = argparse.ArgumentParser(description="Prétraitement des fichiers bruts en format long standardisé.")
    parser.add_argument("--raw-dir", type=Path, default=Path("data/raw"), help="Répertoire des fichiers bruts CSV.")
    parser.add_argument("--output-dir", type=Path, default=Path("data/interim"), help="Destination du format long harmonisé.")
    parser.add_argument(
        "--meta-config",
        type=Path,
        default=None,
        # The loader prefers the project's config/raw_sources.yaml when that
        # file exists and only then falls back to the embedded configuration.
        help=(
            "Chemin vers un fichier JSON/YAML décrivant les meta-données des scrutins. "
            "Par défaut, utilise config/raw_sources.yaml s'il existe, sinon la configuration embarquée."
        ),
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
|
|
def main() -> None:
    """Script entry point: set up logging, parse the CLI, run the preprocessing."""
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    cli = parse_args()
    config = load_meta_config(cli.meta_config)
    preprocess_all(cli.raw_dir, cli.output_dir, config)
|
|
|
|
|
|
|
|
# Run the preprocessing only when the file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|