Spaces:

Eric2mangel
/

Benfordslaw

Sleeping

App Files Files Community

Benfordslaw / utils.py

Eric2mangel

Upload 2 files

ef8b14c verified 5 months ago

raw

history blame contribute delete

9.84 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from scipy.stats import chisquare, chi2
	import tempfile
	import matplotlib

	# Matplotlib configuration: select the 'Agg' backend at import time, before
	# any figure is created by the functions below.
	# NOTE(review): presumably chosen so charts can be rendered straight to PNG
	# files on a machine without a display — confirm against the deployment.
	matplotlib.use('Agg')

	def extraire_premier_chiffre(val):
	    """Return the first significant digit (1-9) of a strictly positive number.

	    Args:
	        val: Value to inspect, typically an int or float.

	    Returns:
	        The leading non-zero digit as an ``int``, or ``None`` when no digit
	        can be extracted: missing values (``pd.isna``), zero and negative
	        numbers, infinities, and values that cannot be compared with a
	        number (e.g. arbitrary strings).
	    """
	    try:
	        if pd.isna(val) or val <= 0:
	            return None
	    except (TypeError, ValueError):
	        # Not comparable with a number (e.g. a string), or an array-like whose
	        # truth value is ambiguous: there is no single leading digit to return.
	        return None

	    # Dropping the decimal point and the leading zeros puts the first
	    # significant digit in front, including for scientific notation
	    # ('1.5e-05' -> '15e-05').
	    chiffres = str(val).replace('.', '').lstrip('0')

	    # str(inf) is 'inf': without this check int('i') would raise ValueError.
	    if chiffres and chiffres[0] in "123456789":
	        return int(chiffres[0])
	    return None

	def calculer_benford_stats_et_graphique(data_a_tester, colonne_nom, N, html_output):
	    """Run the Benford first-digit test, save the chart and append the HTML report.

	    The first significant digit of every value is extracted with
	    ``extraire_premier_chiffre``; values without a usable digit are dropped.
	    The observed digit distribution is compared with Benford's law through
	    the mean absolute deviation (MAD) and a chi-square goodness-of-fit test
	    (8 degrees of freedom, 5 % significance level).

	    Args:
	        data_a_tester: Object exposing ``.apply(func)`` whose result supports
	            ``.dropna()`` and ``.value_counts()`` (presumably a pandas Series
	            of numbers -- confirm against the caller).
	        colonne_nom: Label inserted in the chart title.
	        N: Observation count; only used for the large-sample note
	            (``N > 10000``) shown when conformity is rejected.
	        html_output: HTML accumulated so far; the report is appended to it.

	    Returns:
	        tuple: ``(html, chart_path)`` where ``html`` is ``html_output``
	        followed by the report and ``chart_path`` is the path of a PNG written
	        to a temporary file that is NOT deleted automatically. When no first
	        digit can be extracted, ``html`` carries an error message and
	        ``chart_path`` is ``None``.
	    """
	    # Theoretical Benford probabilities for the leading digits 1..9.
	    benford_probs = np.log10(1 + 1 / np.arange(1, 10))

	    # --- Statistics ---
	    first_digits = data_a_tester.apply(extraire_premier_chiffre).dropna()
	    n_valid = len(first_digits)

	    if n_valid == 0:
	        # Without this guard the frequencies are 0/0 = NaN, the p-value is NaN
	        # and `nan < 0.05` is False, so the report would claim conformity.
	        html_output += (
	            "<div style='background: #fef2f2; border-left: 4px solid #ef4444; "
	            "color: #991b1b; padding: 12px; border-radius: 8px; margin: 10px 0; "
	            "font-size: 14px;'>"
	            "❌ Test de Benford impossible : aucune valeur strictement positive exploitable."
	            "</div>"
	        )
	        # Same trailing tag as the normal path (see the note at the end).
	        html_output += "</div>"
	        return html_output, None

	    observed_counts = first_digits.value_counts()
	    # Make sure every digit 1..9 is represented, even with a zero count.
	    full_observed = np.array([observed_counts.get(d, 0) for d in range(1, 10)])
	    expected_counts = benford_probs * n_valid

	    # Mean absolute deviation between observed and theoretical frequencies.
	    observed_freq = full_observed / n_valid
	    mad_value = np.mean(np.abs(observed_freq - benford_probs))

	    # Chi-square goodness-of-fit test and its 5 % critical value (about 15.5).
	    chi2_stat, p_value = chisquare(full_observed, expected_counts)
	    chi2_critique = chi2.ppf(0.95, 8)

	    # --- Chart ---
	    fig, ax = plt.subplots(figsize=(10, 5.5))
	    try:
	        digits = list(range(1, 10))
	        x = np.arange(len(digits))
	        width = 0.38

	        color_observed = '#3b82f6'  # soft blue
	        color_benford = '#f59e0b'  # amber

	        ax.bar(x - width/2, observed_freq * 100, width, label='% Observé', alpha=0.85, color=color_observed, edgecolor='white', linewidth=1.5)
	        ax.bar(x + width/2, benford_probs * 100, width, label='% Benford (théorique)', alpha=0.85, color=color_benford, edgecolor='white', linewidth=1.5)

	        ax.set_xlabel('Premier chiffre', fontsize=13, fontweight='600', color='#1F2937')
	        ax.set_ylabel('Fréquence (%)', fontsize=13, fontweight='600', color='#1F2937')
	        ax.set_title(f'Test de Benford - {colonne_nom}', fontsize=13, fontweight='700', color='#111827', pad=12)
	        ax.set_xticks(x)
	        ax.set_xticklabels(digits, fontsize=10)
	        ax.legend(fontsize=10, framealpha=0.95, edgecolor='#E5E7EB')
	        ax.grid(axis='y', alpha=0.2, linestyle='--', linewidth=0.7)
	        ax.set_facecolor('#FAFAFA')
	        fig.patch.set_facecolor('white')
	        fig.tight_layout()

	        # Reserve a path, then close the handle before Matplotlib writes to
	        # it: the handle is no longer leaked, and the file is not held open
	        # while it is being rewritten by name.
	        with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
	            chemin_graphique = temp_file.name
	        fig.savefig(chemin_graphique, format='png', dpi=120, bbox_inches='tight', facecolor='white')
	    finally:
	        # Release the figure even when rendering or saving fails.
	        plt.close(fig)

	    # --- HTML report ---
	    # Fragments are collected in a list and joined once at the end.
	    parts = [html_output]

	    # Compact CSS shared by the cards and boxes below.
	    parts.append("""
	<style>
	.stats-grid {
	display: grid;
	grid-template-columns: repeat(3, 1fr);
	gap: 10px;
	margin: 12px 0;
	}
	.stat-card {
	background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%);
	padding: 12px;
	border-radius: 8px;
	text-align: center;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	color: white;
	}
	.stat-card.amber { background: linear-gradient(135deg, #f59e0b 0%, #d97706 100%); }
	.stat-card.emerald { background: linear-gradient(135deg, #10b981 0%, #059669 100%); }
	.stat-label { font-size: 11px; opacity: 0.9; font-weight: 500; margin-bottom: 3px; }
	.stat-value { font-size: 22px; font-weight: 700; }
	.conclusion-box {
	padding: 12px;
	border-radius: 8px;
	margin: 10px 0;
	border-left: 4px solid;
	font-size: 14px;
	}
	.conclusion-box.success { background: #f0fdf4; border-color: #10b981; color: #065f46; }
	.conclusion-box.error { background: #fef2f2; border-color: #ef4444; color: #991b1b; }
	.info-box {
	background: #f0f9ff;
	border-left: 3px solid #0284c7;
	padding: 10px;
	border-radius: 6px;
	margin: 8px 0;
	font-size: 13px;
	}
	.mad-badge {
	display: inline-block;
	padding: 4px 12px;
	border-radius: 16px;
	font-weight: 600;
	font-size: 12px;
	margin-top: 6px;
	}
	h3 { color: #111827; font-size: 15px; font-weight: 700; margin: 12px 0 6px 0; }
	</style>
	""")

	    # Key figures as three compact cards.
	    parts.append("<h3>📊 Résultats</h3>")
	    parts.append("<div class='stats-grid'>")
	    parts.append(f"""
	<div class='stat-card'>
	<div class='stat-label'>MAD</div>
	<div class='stat-value'>{mad_value:.4f}</div>
	</div>
	<div class='stat-card amber'>
	<div class='stat-label'>Chi-carré</div>
	<div class='stat-value'>{chi2_stat:.1f} ({chi2_stat/chi2_critique:.1f}x seuil)</div>
	</div>
	<div class='stat-card emerald'>
	<div class='stat-label'>P-value</div>
	<div class='stat-value'>{p_value:.4f}</div>
	</div>
	""")
	    parts.append("</div>")

	    # Chi-square verdict at the 5 % level.
	    parts.append("<h3>🎯 Conclusion</h3>")
	    if p_value < 0.05:
	        parts.append("<div class='conclusion-box error'>")
	        parts.append(f"<p style='font-weight: 700; font-size: 14px; margin: 0 0 6px 0;'>❌ Conformité REJETÉE (p={p_value:.4f})</p>")

	        # With a very large sample the chi-square test rejects even small
	        # deviations, so a low MAD is put into context for the reader.
	        if N > 10000 and mad_value < 0.020:
	            parts.append("<div class='info-box' style='margin-top: 8px;'>")
	            parts.append(f"<strong>⚠️ Contexte :</strong> Avec {N:,} obs, le test est hypersensible. ")
	            parts.append(f"MAD de {mad_value:.4f} indique des écarts modérés (biais structurels probables).")
	            parts.append("</div>")
	        parts.append("</div>")
	    else:
	        parts.append("<div class='conclusion-box success'>")
	        parts.append(f"<p style='font-weight: 700; font-size: 14px; margin: 0;'>✅ Conformité ACCEPTÉE (p={p_value:.4f})</p>")
	        parts.append("</div>")

	    # MAD-based conformity badge.
	    parts.append("<h3>🎯 Conformité MAD (Écart Absolu Moyen)</h3>")
	    parts.append("<p style='font-size: 0.9em; color: #555;'>Le MAD mesure l'écart moyen entre les fréquences observées et les fréquences théoriques de Benford. Plus le MAD est petit, plus la conformité est forte.</p>")

	    # (exclusive upper bound, badge style, label): the first bound that
	    # exceeds the MAD wins; the smaller the MAD, the stronger the conformity.
	    niveaux_mad = (
	        (0.006, "background: linear-gradient(135deg, #10b981, #059669); color: white;", "EXCELLENTE CONFORMITÉ"),
	        (0.012, "background: linear-gradient(135deg, #22c55e, #16a34a); color: white;", "BONNE CONFORMITÉ"),
	        (0.015, "background: linear-gradient(135deg, #f59e0b, #d97706); color: white;", "CONFORMITÉ MARGINALE"),
	        (0.020, "background: linear-gradient(135deg, #f97316, #ea580c); color: white;", "FAIBLE CONFORMITÉ"),
	    )
	    for seuil, badge_style, mad_text in niveaux_mad:
	        if mad_value < seuil:
	            break
	    else:
	        # MAD >= 0.020: strong deviation, hence very weak or no conformity.
	        badge_style = "background: linear-gradient(135deg, #ef4444, #dc2626); color: white;"
	        mad_text = "TRÈS FAIBLE CONFORMITÉ"

	    parts.append(f"<div style='margin-top: 10px; font-size: 1.1em; font-weight: bold; padding: 10px; text-align: center; {badge_style}'>")
	    parts.append(mad_text)
	    parts.append("</div>")

	    # NOTE(review): this closing tag has no matching opening tag inside this
	    # function; presumably it closes a wrapper <div> that the caller opened
	    # in html_output -- confirm against the caller before removing it.
	    parts.append("</div>")

	    return "".join(parts), chemin_graphique