Spaces:
Running
Running
Update src/Analytics/Pipeline_Builder.py
Browse files
src/Analytics/Pipeline_Builder.py
CHANGED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MODULE: PIPELINE BUILDER (DATA REFINERY)
|
| 3 |
+
========================================
|
| 4 |
+
Responsabilité :
|
| 5 |
+
1. Standardiser et Nettoyer les données (Dates, Nombres, Strings).
|
| 6 |
+
2. Appliquer les règles métier AVANT l'ingestion RDF.
|
| 7 |
+
3. Garantir les "Contrats de Données" (Data Contracts).
|
| 8 |
+
"""
|
| 9 |
+
import re
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from datetime import datetime
|
| 12 |
+
|
| 13 |
+
class PipelineBuilder:
    """Clean and standardise raw values before they are ingested as RDF.

    ``enforce_contract`` is the single public entry point: it maps a raw
    value to the representation expected for a target XSD type name and
    delegates to one of the private ``_clean_*`` / ``_standardize_*`` /
    ``_to_*`` helpers below.
    """

    # Raw strings (compared after strip + lower-casing) treated as missing.
    _NULL_TOKENS = frozenset({'nan', 'none', '', 'null'})

    # Lower-cased strings interpreted as boolean True; anything else is False.
    _TRUE_TOKENS = frozenset({'true', '1', 'oui', 'yes', 'vrai', 'active'})

    # (compiled pattern, (year, month, day) group indices), tried in order.
    _DATE_PATTERNS = (
        # DD/MM/YYYY or DD-MM-YYYY, e.g. 29-01-2026 -> 2026-01-29
        (re.compile(r'(\d{2})[-/](\d{2})[-/](\d{4})'), (3, 2, 1)),
        # YYYY-MM-DD or YYYY/MM/DD (already ISO ordered)
        (re.compile(r'(\d{4})[-/](\d{2})[-/](\d{2})'), (1, 2, 3)),
    )

    def __init__(self):
        # Starts empty; no method in this class appends to it.
        self.transformations_log = []

    def enforce_contract(self, value, target_type):
        """Convert a raw value into the form required by ``target_type``.

        Args:
            value: Raw cell value (any scalar). It is stringified and
                stripped before being routed to a cleaner.
            target_type: XSD type name. ``"xsd:decimal"``, ``"xsd:float"``,
                ``"xsd:integer"``, ``"xsd:date"`` and ``"xsd:boolean"`` have
                dedicated cleaners; any other value falls back to text
                cleaning.

        Returns:
            ``None`` when the value is missing (pandas NA, or one of the
            null tokens ``nan`` / ``none`` / ``null`` / empty string,
            case-insensitive); otherwise the cleaned float, int, str or bool.
        """
        if pd.isna(value) or str(value).strip().lower() in self._NULL_TOKENS:
            return None

        val_str = str(value).strip()

        # --- Route according to the target type ---
        if target_type in ("xsd:decimal", "xsd:float"):
            return self._clean_currency(val_str)
        if target_type == "xsd:integer":
            return self._clean_integer(val_str)
        if target_type == "xsd:date":
            return self._standardize_date(val_str)
        if target_type == "xsd:boolean":
            return self._to_boolean(val_str)

        # Default: standard text clean-up.
        return self._clean_text(val_str)

    # --- Unit transformations ---

    def _clean_currency(self, val):
        """Parse an amount such as ``'20 000 FCFA'`` into ``20000.0``.

        Everything except digits, ``,``, ``.`` and ``-`` is dropped. The
        separators are then interpreted as follows:

        * both ``,`` and ``.`` present: the right-most separator is the
          decimal mark, all the others are thousands separators
          (``'1,234.56'`` and ``'1.234,56'`` both give ``1234.56``);
        * one kind of separator repeated: all are thousands separators
          (``'1,234,567'`` gives ``1234567.0``);
        * a single separator: it is the decimal mark (``'12,5'`` gives
          ``12.5``).

        Returns:
            The parsed float, or ``0.0`` when nothing parseable remains.
        """
        try:
            cleaned = re.sub(r'[^\d,.-]', '', val)
            commas = cleaned.count(',')
            dots = cleaned.count('.')

            if commas and dots:
                decimal_at = max(cleaned.rfind(','), cleaned.rfind('.'))
                whole = cleaned[:decimal_at].replace(',', '').replace('.', '')
                cleaned = f"{whole}.{cleaned[decimal_at + 1:]}"
            elif commas > 1 or dots > 1:
                cleaned = cleaned.replace(',', '').replace('.', '')
            else:
                cleaned = cleaned.replace(',', '.')

            return float(cleaned)
        except (TypeError, ValueError):
            # Best-effort fallback kept from the original contract.
            return 0.0

    def _clean_integer(self, val):
        """Parse a quantity such as ``'12 mois'`` into ``12``.

        The value is parsed with ``_clean_currency`` and then truncated
        towards zero with ``int``.

        Returns:
            The parsed int, or ``0`` when the conversion fails.
        """
        try:
            return int(self._clean_currency(val))
        except (TypeError, ValueError, OverflowError):
            # OverflowError: a digit string so long that float() yields inf.
            return 0

    def _standardize_date(self, val):
        """Rewrite a date found in ``val`` as ``YYYY-MM-DD``.

        Supported inputs: ``DD/MM/YYYY``, ``DD-MM-YYYY`` and ``YYYY-MM-DD``
        (``/`` or ``-`` as separator), searched anywhere in the string.

        Returns:
            The ISO date string when a pattern matches and the result is a
            real calendar date; otherwise ``val`` unchanged, so the caller
            can detect and log the failure.
        """
        for pattern, (year, month, day) in self._DATE_PATTERNS:
            match = pattern.search(val)
            if not match:
                continue
            iso = f"{match.group(year)}-{match.group(month)}-{match.group(day)}"
            try:
                # Reject impossible dates such as 2026-99-99.
                datetime.strptime(iso, "%Y-%m-%d")
            except ValueError:
                continue
            return iso

        return val

    def _to_boolean(self, val):
        """Return True when ``val`` (case-insensitive) is a known truthy token."""
        return val.lower() in self._TRUE_TOKENS

    def _clean_text(self, val):
        """Collapse each run of whitespace to one space and trim both ends."""
        return re.sub(r'\s+', ' ', val).strip()
|