Benfordslaw / utils.py
Eric2mangel's picture
Upload 2 files
ef8b14c verified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chisquare, chi2
import tempfile
import matplotlib
# Matplotlib configuration: select the non-interactive 'Agg' backend, so the
# figures produced in this module are rendered to files rather than to a window.
matplotlib.use('Agg')
def extraire_premier_chiffre(val):
    """Return the first significant digit (1-9) of a strictly positive number.

    The digit is read from the textual form of ``val``: the first character
    in ``1``-``9`` is the leading significant digit, whatever precedes it
    (leading zeros, the decimal point). This also works for scientific
    notation such as ``1e-07`` because the mantissa is printed first.

    Args:
        val: A scalar numeric value; missing values (``None``/NaN) are allowed.

    Returns:
        The leading digit as an ``int`` between 1 and 9, or ``None`` when the
        value is missing, zero, negative, or has no digit in its textual form
        (e.g. positive infinity, printed as ``inf``).
    """
    if pd.isna(val) or val <= 0:
        return None
    # Scanning for the first non-zero digit avoids calling int() on a
    # non-digit character, which used to raise ValueError for infinity.
    for caractere in str(val):
        if caractere in "123456789":
            return int(caractere)
    return None
# Stylesheet emitted ahead of the result cards. Kept at module level so the
# literal stays out of the function body.
_BENFORD_CSS = """
<style>
.stats-grid {
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 10px;
margin: 12px 0;
}
.stat-card {
background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
padding: 12px;
border-radius: 8px;
text-align: center;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
color: white;
}
.stat-card.amber { background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%); }
.stat-card.emerald { background: linear-gradient(135deg, #10b981 0%, #059669 100%); }
.stat-label { font-size: 11px; opacity: 0.9; font-weight: 500; margin-bottom: 3px; }
.stat-value { font-size: 22px; font-weight: 700; }
.conclusion-box {
padding: 12px;
border-radius: 8px;
margin: 10px 0;
border-left: 4px solid;
font-size: 14px;
}
.conclusion-box.success { background: #f0fdf4; border-color: #10b981; color: #065f46; }
.conclusion-box.error { background: #fef2f2; border-color: #ef4444; color: #991b1b; }
.info-box {
background: #f0f9ff;
border-left: 3px solid #0284c7;
padding: 10px;
border-radius: 6px;
margin: 8px 0;
font-size: 13px;
}
.mad-badge {
display: inline-block;
padding: 4px 12px;
border-radius: 16px;
font-weight: 600;
font-size: 12px;
margin-top: 6px;
}
h3 { color: #111827; font-size: 15px; font-weight: 700; margin: 12px 0 6px 0; }
</style>
"""

# Template for the three statistic cards (MAD, chi-square, p-value).
_BENFORD_CARTES = """
<div class='stat-card'>
<div class='stat-label'>MAD</div>
<div class='stat-value'>{mad:.4f}</div>
</div>
<div class='stat-card amber'>
<div class='stat-label'>Chi-carré</div>
<div class='stat-value'>{chi2:.1f} ({ratio:.1f}x seuil)</div>
</div>
<div class='stat-card emerald'>
<div class='stat-label'>P-value</div>
<div class='stat-value'>{p:.4f}</div>
</div>
"""

# MAD conformity bands as (exclusive upper bound, badge CSS, label), ordered
# from best to worst. A MAD at or above the last bound uses _MAD_HORS_BANDES.
_MAD_BANDES = (
    (0.006, "background: linear-gradient(135deg, #10b981, #059669); color: white;", "EXCELLENTE CONFORMITÉ"),
    (0.012, "background: linear-gradient(135deg, #22c55e, #16a34a); color: white;", "BONNE CONFORMITÉ"),
    (0.015, "background: linear-gradient(135deg, #f59e0b, #d97706); color: white;", "CONFORMITÉ MARGINALE"),
    (0.020, "background: linear-gradient(135deg, #f97316, #ea580c); color: white;", "FAIBLE CONFORMITÉ"),
)
_MAD_HORS_BANDES = (
    "background: linear-gradient(135deg, #ef4444, #dc2626); color: white;",
    "TRÈS FAIBLE CONFORMITÉ",
)


def _tracer_graphique_benford(observed_pct, benford_pct, colonne_nom):
    """Draw the observed-vs-Benford bar chart and return the path of the PNG."""
    fig, ax = plt.subplots(figsize=(10, 5.5))
    try:
        digits = list(range(1, 10))
        x = np.arange(len(digits))
        width = 0.38
        barres = dict(alpha=0.85, edgecolor='white', linewidth=1.5)

        # Blue bars for the observed share, amber bars for the Benford share.
        ax.bar(x - width / 2, observed_pct, width, label='% Observé', color='#3b82f6', **barres)
        ax.bar(x + width / 2, benford_pct, width, label='% Benford (théorique)', color='#f59e0b', **barres)

        ax.set_xlabel('Premier chiffre', fontsize=13, fontweight='600', color='#1F2937')
        ax.set_ylabel('Fréquence (%)', fontsize=13, fontweight='600', color='#1F2937')
        ax.set_title(f'Test de Benford - {colonne_nom}', fontsize=13, fontweight='700', color='#111827', pad=12)
        ax.set_xticks(x)
        ax.set_xticklabels(digits, fontsize=10)
        ax.legend(fontsize=10, framealpha=0.95, edgecolor='#E5E7EB')
        ax.grid(axis='y', alpha=0.2, linestyle='--', linewidth=0.7)
        ax.set_facecolor('#FAFAFA')
        fig.patch.set_facecolor('white')

        # Reserve a file name, then close the handle before saving: the file
        # outlives this function (delete=False) but no descriptor is leaked
        # and the file is not held open while matplotlib writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp:
            chemin = tmp.name

        # Act on this figure explicitly rather than on pyplot's "current" one.
        fig.tight_layout()
        fig.savefig(chemin, format='png', dpi=120, bbox_inches='tight', facecolor='white')
    finally:
        plt.close(fig)
    return chemin


def calculer_benford_stats_et_graphique(data_a_tester, colonne_nom, N, html_output):
    """Run the Benford first-digit analysis and build its HTML report and chart.

    Computes the observed first-digit distribution, its mean absolute
    deviation (MAD) from Benford's frequencies and a chi-square
    goodness-of-fit test, saves a bar chart to a temporary PNG, and appends
    the formatted results to ``html_output``.

    Args:
        data_a_tester: Values to analyse; must support ``.apply(...)`` followed
            by ``.dropna()`` (presumably a pandas Series - confirm with caller).
        colonne_nom: Name shown in the chart title.
        N: Observation count shown in the large-sample note. It is not used in
            the statistics, which rely on the number of extracted digits.
        html_output: HTML accumulated so far; the report is appended to it.

    Returns:
        A ``(html, chart_path)`` tuple. ``chart_path`` is the path of a PNG
        that is not deleted automatically. When no first digit can be
        extracted, the HTML carries an explicit error message and
        ``chart_path`` is ``None``.
    """
    # Benford's theoretical frequencies for digits 1..9.
    benford_probs = np.array([np.log10(1 + 1 / d) for d in range(1, 10)])

    first_digits = data_a_tester.apply(extraire_premier_chiffre).dropna()
    total = len(first_digits)

    if total == 0:
        # Without this guard every frequency is divided by zero, the
        # statistics become NaN and the report wrongly states that conformity
        # is accepted. The trailing </div> mirrors the one emitted at the end
        # of the normal path (presumably closing a wrapper opened by the
        # caller - confirm) so both paths leave the same nesting.
        message = (
            "<div class='conclusion-box error'>"
            "<p style='font-weight: 700; font-size: 14px; margin: 0;'>"
            "❌ Aucun premier chiffre exploitable dans cette colonne : test de Benford impossible."
            "</p></div>"
        )
        return "".join([html_output, _BENFORD_CSS, "<h3>🎯 Conclusion</h3>", message, "</div>"]), None

    # Count each digit, making sure all of 1..9 are present (0 when absent).
    observed_counts = first_digits.value_counts()
    full_observed = np.array([observed_counts.get(d, 0) for d in range(1, 10)])
    expected_counts = benford_probs * total

    observed_freq = full_observed / total
    mad_value = np.mean(np.abs(observed_freq - benford_probs))

    chi2_stat, p_value = chisquare(full_observed, expected_counts)
    # Critical value at the 5% level with 8 degrees of freedom (about 15.5).
    chi2_critique = chi2.ppf(0.95, 8)

    chemin_graphique = _tracer_graphique_benford(observed_freq * 100, benford_probs * 100, colonne_nom)

    morceaux = [
        html_output,
        _BENFORD_CSS,
        "<h3>📊 Résultats</h3>",
        "<div class='stats-grid'>",
        _BENFORD_CARTES.format(mad=mad_value, chi2=chi2_stat, ratio=chi2_stat / chi2_critique, p=p_value),
        "</div>",
        "<h3>🎯 Conclusion</h3>",
    ]

    if p_value < 0.05:
        morceaux.append("<div class='conclusion-box error'>")
        morceaux.append(
            "<p style='font-weight: 700; font-size: 14px; margin: 0 0 6px 0;'>"
            f"❌ Conformité REJETÉE (p={p_value:.4f})</p>"
        )
        # On large samples the chi-square test rejects even for small
        # deviations, so add context when the MAD remains below 0.020.
        if N > 10000 and mad_value < 0.020:
            morceaux.append("<div class='info-box' style='margin-top: 8px;'>")
            morceaux.append(f"<strong>⚠️ Contexte :</strong> Avec {N:,} obs, le test est hypersensible. ")
            morceaux.append(f"MAD de {mad_value:.4f} indique des écarts modérés (biais structurels probables).")
            morceaux.append("</div>")
        morceaux.append("</div>")
    else:
        morceaux.append("<div class='conclusion-box success'>")
        morceaux.append(
            "<p style='font-weight: 700; font-size: 14px; margin: 0;'>"
            f"✅ Conformité ACCEPTÉE (p={p_value:.4f})</p>"
        )
        morceaux.append("</div>")

    morceaux.append("<h3>🎯 Conformité MAD (Écart Absolu Moyen)</h3>")
    # <strong> replaces the Markdown ** markers, which are not interpreted
    # inside an HTML paragraph and were displayed as literal asterisks.
    morceaux.append(
        "<p style='font-size: 0.9em; color: #555;'>Le MAD mesure l'écart moyen entre les fréquences "
        "observées et les fréquences théoriques de Benford. Plus le MAD est <strong>petit</strong>, "
        "plus la conformité est <strong>forte</strong>.</p>"
    )

    # First band whose upper bound exceeds the MAD; otherwise the worst label.
    badge_style, mad_text = next(
        ((style, libelle) for borne, style, libelle in _MAD_BANDES if mad_value < borne),
        _MAD_HORS_BANDES,
    )
    morceaux.append(
        "<div style='margin-top: 10px; font-size: 1.1em; font-weight: bold; "
        f"padding: 10px; text-align: center; {badge_style}'>"
    )
    morceaux.append(mad_text)
    morceaux.append("</div>")

    # Closing tag with no matching opener in this function; kept because it
    # presumably closes a wrapper opened in the incoming html_output.
    morceaux.append("</div>")

    return "".join(morceaux), chemin_graphique