Spaces:
Running
Running
| """ | |
| MODULE: PIPELINE BUILDER (FINAL FIX - TIME SUPPORT) | |
| =================================================== | |
| Responsabilité : | |
| 1. Nettoyer les données (Currency, Int, Date, DateTime). | |
| 2. Localiser précisément les erreurs (Lineage Ligne/Colonne). | |
| 3. Support étendu des formats de date (Heures incluses). | |
| """ | |
| import re | |
| import pandas as pd | |
| from collections import defaultdict | |
| from datetime import datetime | |
class PipelineBuilder:
    """Clean raw cell values against a target XSD type and track the outcome.

    Each call to :meth:`enforce_contract` converts one value. Counters are
    accumulated in ``self.stats`` (``missing_values``, ``valid_entries``,
    ``rejected_contracts``) and every rejected value is appended to
    ``self.logs`` together with the row id and column name it came from.
    """

    # Textual spellings treated as "no value" (compared lower-cased, stripped).
    _NULL_TOKENS = frozenset({'nan', 'none', '', 'null'})
    # Lower-cased spellings converted to True; anything else becomes False.
    _TRUE_TOKENS = frozenset(
        {'true', '1', 'oui', 'yes', 'vrai', 'active', 'actif'}
    )
    # Everything that is not a digit, a separator or a minus sign.
    _NUMERIC_NOISE = re.compile(r'[^\d,.-]')
    _PLAIN_INTEGER = re.compile(r'-?\d+')
    # "1,234,567.89": comma-grouped thousands, optional dot decimals.
    _COMMA_GROUPED = re.compile(r'-?\d{1,3}(?:,\d{3})+(?:\.\d+)?')
    # "1.234.567,89": dot-grouped thousands, optional comma decimals.
    _DOT_GROUPED = re.compile(r'-?\d{1,3}(?:\.\d{3})+(?:,\d+)?')
    # (input format, output format used when the time part is wanted).
    _DATE_FORMATS = (
        ("%d/%m/%Y", "%Y-%m-%d"),  # 01/03/2026
        ("%d-%m-%Y", "%Y-%m-%d"),  # 01-03-2026
        ("%Y-%m-%d", "%Y-%m-%d"),  # 2026-03-01
        ("%d-%m-%Y %H:%M:%S", "%Y-%m-%dT%H:%M:%S"),  # 13-01-2026 23:40:03
        ("%d/%m/%Y %H:%M:%S", "%Y-%m-%dT%H:%M:%S"),
        ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"),
        # ISO form: this is what the method itself emits, so accepting it
        # lets already-cleaned values pass through a second run.
        ("%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S"),
    )

    def __init__(self):
        """Start with empty counters and an empty rejection log."""
        self.stats = defaultdict(int)
        self.logs = []

    def get_health_report(self):
        """Return a plain ``dict`` snapshot of the counters."""
        return dict(self.stats)

    def enforce_contract(self, value, target_type, row_id="N/A", col_name="N/A"):
        """Convert ``value`` to ``target_type`` or record why it was rejected.

        Args:
            value: Raw cell value.
            target_type: One of ``xsd:decimal``, ``xsd:float``,
                ``xsd:integer``, ``xsd:date``, ``xsd:dateTime``,
                ``xsd:boolean``; any other type is cleaned as text.
            row_id: Row identifier stored in the rejection log.
            col_name: Column name stored in the rejection log.

        Returns:
            The cleaned value, or ``None`` when the value is missing or does
            not satisfy the contract.
        """
        # 1. Missing values are counted but never logged as errors.
        if pd.isna(value) or str(value).strip().lower() in self._NULL_TOKENS:
            self.stats["missing_values"] += 1
            return None

        val_str = str(value).strip()
        try:
            # 2. Route to the cleaner matching the target type.
            if target_type in ("xsd:decimal", "xsd:float"):
                cleaned = self._clean_currency(val_str)
            elif target_type == "xsd:integer":
                cleaned = self._clean_integer(val_str)
            elif target_type == "xsd:date":
                # Always YYYY-MM-DD, any time part is dropped.
                cleaned = self._standardize_date(val_str, want_time=False)
            elif target_type == "xsd:dateTime":
                # Keep the time part when the input carries one.
                cleaned = self._standardize_date(val_str, want_time=True)
            elif target_type == "xsd:boolean":
                cleaned = self._to_boolean(val_str)
            else:
                cleaned = self._clean_text(val_str)
        except ValueError:
            # 3. Record where the bad value lives for the correction guide.
            self.stats["rejected_contracts"] += 1
            self.logs.append({
                "📍 Ligne (ID)": str(row_id),
                "📌 Colonne": str(col_name),
                "❌ Valeur": val_str,
                "⚠️ Attendu": target_type,
                "Raison": "Format incompatible"
            })
            return None

        self.stats["valid_entries"] += 1
        return cleaned

    # --- CLEANING ---
    def _clean_currency(self, val):
        """Parse a decimal amount, ignoring currency symbols and spaces.

        Thousands separators are removed when the digits form proper
        3-digit groups and the grouping is unambiguous (the separator is
        repeated, or the other separator is present as the decimal mark).
        A single lone separator is always read as the decimal mark, so
        ``"1,234"`` yields ``1.234``.

        Raises:
            ValueError: If nothing numeric remains or the text is not a
                valid number.
        """
        clean = self._NUMERIC_NOISE.sub('', val)
        if not clean:
            raise ValueError("Vide après nettoyage")

        if self._COMMA_GROUPED.fullmatch(clean) and (
            clean.count(',') > 1 or '.' in clean
        ):
            clean = clean.replace(',', '')
        elif self._DOT_GROUPED.fullmatch(clean) and (
            clean.count('.') > 1 or ',' in clean
        ):
            clean = clean.replace('.', '')

        return float(clean.replace(',', '.'))

    def _clean_integer(self, val):
        """Parse an integer; decimal input is truncated toward zero.

        Raises:
            ValueError: If the text is not numeric or is too large to
                represent.
        """
        digits = self._NUMERIC_NOISE.sub('', val)
        # Plain integers are parsed directly so large values stay exact
        # instead of being rounded by a float round-trip.
        if self._PLAIN_INTEGER.fullmatch(digits):
            return int(digits)
        try:
            return int(self._clean_currency(val))
        except OverflowError as exc:
            # int(inf): surface it as the ValueError the caller handles.
            raise ValueError(f"Entier hors limites: {val}") from exc

    def _standardize_date(self, val, want_time=False):
        """Normalise a date string to ISO 8601.

        Args:
            val: Date text in one of the supported formats.
            want_time: When False the result is always ``YYYY-MM-DD``.
                When True the result is ``YYYY-MM-DDTHH:MM:SS`` if the
                input has a time part, otherwise ``YYYY-MM-DD``.

        Raises:
            ValueError: If no supported format matches.
        """
        # A date-only request keeps just the part before the first space.
        if not want_time and " " in val:
            val = val.split(" ")[0]

        for fmt_in, fmt_out in self._DATE_FORMATS:
            try:
                parsed = datetime.strptime(val, fmt_in)
            except ValueError:
                continue
            return parsed.strftime(fmt_out if want_time else "%Y-%m-%d")

        raise ValueError(f"Date invalide: {val}")

    def _to_boolean(self, val):
        """Return True for a recognised truthy spelling, False otherwise.

        Never raises: unrecognised text is reported as False.
        """
        return val.lower() in self._TRUE_TOKENS

    def _clean_text(self, val):
        """Collapse every whitespace run to one space and trim the ends."""
        return re.sub(r'\s+', ' ', val).strip()