Spaces:
Running
Running
Update src/Analytics/pipeline_builder.py
Browse files
src/Analytics/pipeline_builder.py
CHANGED
|
@@ -1,30 +1,26 @@
|
|
| 1 |
"""
|
| 2 |
-
MODULE: PIPELINE BUILDER (
|
| 3 |
-
=================================================
|
| 4 |
Responsabilité :
|
| 5 |
-
1. Nettoyer les données.
|
| 6 |
-
2.
|
| 7 |
-
3.
|
| 8 |
"""
|
| 9 |
import re
|
| 10 |
import pandas as pd
|
| 11 |
from collections import defaultdict
|
|
|
|
| 12 |
|
| 13 |
class PipelineBuilder:
|
| 14 |
def __init__(self):
|
| 15 |
-
# MONITORING : Compteurs globaux
|
| 16 |
self.stats = defaultdict(int)
|
| 17 |
-
# LOGS STRUCTURÉS : Pour le tableau de bord
|
| 18 |
self.logs = []
|
| 19 |
|
| 20 |
def get_health_report(self):
|
| 21 |
return dict(self.stats)
|
| 22 |
|
| 23 |
def enforce_contract(self, value, target_type, row_id="N/A", col_name="N/A"):
|
| 24 |
-
|
| 25 |
-
Applique le contrat. Si ça casse, on note QUI (row_id) et QUOI (col_name).
|
| 26 |
-
"""
|
| 27 |
-
# 1. Gestion des valeurs vides
|
| 28 |
if pd.isna(value) or str(value).strip().lower() in ['nan', 'none', '', 'null']:
|
| 29 |
self.stats["missing_values"] += 1
|
| 30 |
return None
|
|
@@ -34,7 +30,7 @@ class PipelineBuilder:
|
|
| 34 |
try:
|
| 35 |
cleaned = None
|
| 36 |
|
| 37 |
-
# 2. Routage
|
| 38 |
if target_type in ["xsd:decimal", "xsd:float"]:
|
| 39 |
cleaned = self._clean_currency(val_str)
|
| 40 |
|
|
@@ -42,7 +38,12 @@ class PipelineBuilder:
|
|
| 42 |
cleaned = self._clean_integer(val_str)
|
| 43 |
|
| 44 |
elif target_type == "xsd:date":
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
elif target_type == "xsd:boolean":
|
| 48 |
cleaned = self._to_boolean(val_str)
|
|
@@ -50,49 +51,59 @@ class PipelineBuilder:
|
|
| 50 |
else:
|
| 51 |
cleaned = self._clean_text(val_str)
|
| 52 |
|
| 53 |
-
# Succès
|
| 54 |
self.stats["valid_entries"] += 1
|
| 55 |
return cleaned
|
| 56 |
|
| 57 |
except ValueError:
|
| 58 |
-
#
|
| 59 |
self.stats["rejected_contracts"] += 1
|
| 60 |
-
|
| 61 |
-
error_report = {
|
| 62 |
"📍 Ligne (ID)": str(row_id),
|
| 63 |
"📌 Colonne": str(col_name),
|
| 64 |
-
"❌ Valeur
|
| 65 |
-
"⚠️
|
| 66 |
-
"
|
| 67 |
-
}
|
| 68 |
-
self.logs.append(error_report)
|
| 69 |
return None
|
| 70 |
|
| 71 |
-
# ---
|
| 72 |
def _clean_currency(self, val):
|
| 73 |
-
# Garde les chiffres, le point, la virgule et le moins
|
| 74 |
clean = re.sub(r'[^\d,.-]', '', val).replace(',', '.')
|
| 75 |
-
if not clean:
|
| 76 |
return float(clean)
|
| 77 |
|
| 78 |
def _clean_integer(self, val):
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
(
|
| 86 |
-
(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
]
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
raise ValueError(f"Date invalide: {val}")
|
| 93 |
|
| 94 |
def _to_boolean(self, val):
|
| 95 |
-
return val.lower() in ['true', '1', 'oui', 'yes', 'vrai', 'active']
|
| 96 |
|
| 97 |
def _clean_text(self, val):
|
| 98 |
return re.sub(r'\s+', ' ', val).strip()
|
|
|
|
| 1 |
"""
|
| 2 |
+
MODULE: PIPELINE BUILDER (FINAL FIX - TIME SUPPORT)
|
| 3 |
+
===================================================
|
| 4 |
Responsabilité :
|
| 5 |
+
1. Nettoyer les données (Currency, Int, Date, DateTime).
|
| 6 |
+
2. Localiser précisément les erreurs (Lineage Ligne/Colonne).
|
| 7 |
+
3. Support étendu des formats de date (Heures incluses).
|
| 8 |
"""
|
| 9 |
import re
|
| 10 |
import pandas as pd
|
| 11 |
from collections import defaultdict
|
| 12 |
+
from datetime import datetime
|
| 13 |
|
| 14 |
class PipelineBuilder:
|
| 15 |
def __init__(self):
|
|
|
|
| 16 |
self.stats = defaultdict(int)
|
|
|
|
| 17 |
self.logs = []
|
| 18 |
|
| 19 |
def get_health_report(self):
|
| 20 |
return dict(self.stats)
|
| 21 |
|
| 22 |
def enforce_contract(self, value, target_type, row_id="N/A", col_name="N/A"):
|
| 23 |
+
# 1. Gestion Nulls
|
|
|
|
|
|
|
|
|
|
| 24 |
if pd.isna(value) or str(value).strip().lower() in ['nan', 'none', '', 'null']:
|
| 25 |
self.stats["missing_values"] += 1
|
| 26 |
return None
|
|
|
|
| 30 |
try:
|
| 31 |
cleaned = None
|
| 32 |
|
| 33 |
+
# 2. Routage par type
|
| 34 |
if target_type in ["xsd:decimal", "xsd:float"]:
|
| 35 |
cleaned = self._clean_currency(val_str)
|
| 36 |
|
|
|
|
| 38 |
cleaned = self._clean_integer(val_str)
|
| 39 |
|
| 40 |
elif target_type == "xsd:date":
|
| 41 |
+
# On force le format YYYY-MM-DD (sans heure)
|
| 42 |
+
cleaned = self._standardize_date(val_str, want_time=False)
|
| 43 |
+
|
| 44 |
+
elif target_type == "xsd:dateTime":
|
| 45 |
+
# On garde l'heure si elle existe
|
| 46 |
+
cleaned = self._standardize_date(val_str, want_time=True)
|
| 47 |
|
| 48 |
elif target_type == "xsd:boolean":
|
| 49 |
cleaned = self._to_boolean(val_str)
|
|
|
|
| 51 |
else:
|
| 52 |
cleaned = self._clean_text(val_str)
|
| 53 |
|
|
|
|
| 54 |
self.stats["valid_entries"] += 1
|
| 55 |
return cleaned
|
| 56 |
|
| 57 |
except ValueError:
|
| 58 |
+
# 3. Capture de l'erreur pour le guide de correction
|
| 59 |
self.stats["rejected_contracts"] += 1
|
| 60 |
+
self.logs.append({
|
|
|
|
| 61 |
"📍 Ligne (ID)": str(row_id),
|
| 62 |
"📌 Colonne": str(col_name),
|
| 63 |
+
"❌ Valeur": val_str,
|
| 64 |
+
"⚠️ Attendu": target_type,
|
| 65 |
+
"Raison": "Format incompatible"
|
| 66 |
+
})
|
|
|
|
| 67 |
return None
|
| 68 |
|
| 69 |
+
# --- NETTOYAGE ---
|
| 70 |
def _clean_currency(self, val):
|
|
|
|
| 71 |
clean = re.sub(r'[^\d,.-]', '', val).replace(',', '.')
|
| 72 |
+
if not clean: raise ValueError("Vide après nettoyage")
|
| 73 |
return float(clean)
|
| 74 |
|
| 75 |
def _clean_integer(self, val):
|
| 76 |
+
return int(self._clean_currency(val))
|
| 77 |
+
|
| 78 |
+
def _standardize_date(self, val, want_time=False):
|
| 79 |
+
# Si on veut juste une date mais qu'on reçoit "YYYY-MM-DD HH:MM:SS", on coupe
|
| 80 |
+
if not want_time and " " in val:
|
| 81 |
+
val = val.split(" ")[0]
|
| 82 |
|
| 83 |
+
# Liste étendue des formats supportés (C'EST LA CLÉ DU SUCCÈS)
|
| 84 |
+
formats = [
|
| 85 |
+
("%d/%m/%Y", "%Y-%m-%d"), # 01/03/2026
|
| 86 |
+
("%d-%m-%Y", "%Y-%m-%d"), # 01-03-2026
|
| 87 |
+
("%Y-%m-%d", "%Y-%m-%d"), # 2026-03-01
|
| 88 |
+
# Formats avec Heure (trouvés dans tes logs)
|
| 89 |
+
("%d-%m-%Y %H:%M:%S", "%Y-%m-%dT%H:%M:%S"), # 13-01-2026 23:40:03
|
| 90 |
+
("%d/%m/%Y %H:%M:%S", "%Y-%m-%dT%H:%M:%S"),
|
| 91 |
+
("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S")
|
| 92 |
]
|
| 93 |
+
|
| 94 |
+
for fmt_in, fmt_out in formats:
|
| 95 |
+
try:
|
| 96 |
+
dt = datetime.strptime(val, fmt_in)
|
| 97 |
+
if not want_time:
|
| 98 |
+
return dt.strftime("%Y-%m-%d")
|
| 99 |
+
return dt.strftime(fmt_out)
|
| 100 |
+
except ValueError:
|
| 101 |
+
continue
|
| 102 |
+
|
| 103 |
raise ValueError(f"Date invalide: {val}")
|
| 104 |
|
| 105 |
def _to_boolean(self, val):
|
| 106 |
+
return val.lower() in ['true', '1', 'oui', 'yes', 'vrai', 'active', 'actif']
|
| 107 |
|
| 108 |
def _clean_text(self, val):
|
| 109 |
return re.sub(r'\s+', ' ', val).strip()
|