Spaces:

CCPA-GAIA
/

GAIA26CCPA

Sleeping

App Files Files Community

GAIA26CCPA / src /proxy_log.py

JosephMcDonnell

Modifs J2 (#9)

13bb974 about 1 month ago

raw

history blame contribute delete

4.94 kB

	"""
	proxy_log.py – Persistence layer for proxy/scenario selections.

	Each time the engine returns a result (single search or formulation row),
	the chosen proxy is logged. The CSV file is stored in a ``data/`` directory
	next to this module so that it survives Streamlit restarts.

	CSV columns:
	timestamp, matiere_recherchee, proxy_choisi, scenario, impact_kg_co2_t,
	source_db, match_exact, pays_production, pays_transformation, type_mp
	"""
	from __future__ import annotations

	import csv
	import os
	from datetime import datetime
	from pathlib import Path
	from collections import Counter

	import pandas as pd

	# ---------------------------------------------------------------------------
	# Storage location
	# ---------------------------------------------------------------------------
	# The log is kept in a "data" directory that sits beside this module.
	_LOG_DIR = Path(__file__).parent.joinpath("data")
	_LOG_FILE = _LOG_DIR.joinpath("proxy_selections.csv")

	# Column order of the CSV file; used both for the header and for every row.
	_FIELDNAMES = [
	    "timestamp", "matiere_recherchee", "proxy_choisi", "scenario",
	    "impact_kg_co2_t", "source_db", "match_exact",
	    "pays_production", "pays_transformation", "type_mp",
	]


	def _ensure_file() -> None:
	    """Create the log directory and the CSV file (with its header) if needed.

	    The header row is written when the file is missing *or* empty. A
	    zero-byte file would otherwise never receive a header, so rows appended
	    later could not be read back under the expected column names.
	    """
	    _LOG_DIR.mkdir(parents=True, exist_ok=True)
	    # A non-empty file is assumed to already start with the header row.
	    if _LOG_FILE.exists() and _LOG_FILE.stat().st_size > 0:
	        return
	    with open(_LOG_FILE, "w", newline="", encoding="utf-8") as f:
	        csv.DictWriter(f, fieldnames=_FIELDNAMES).writeheader()


	# ---------------------------------------------------------------------------
	# Writing
	# ---------------------------------------------------------------------------

	def log_selection(
	    matiere_recherchee: str,
	    proxy_choisi: str,
	    scenario: str,
	    impact_kg_co2_t: float | None = None,
	    source_db: str = "",
	    match_exact: bool = True,
	    pays_production: str = "",
	    pays_transformation: str = "",
	    type_mp: str = "vegetal_animal",
	) -> None:
	    """Append one proxy selection to the CSV log.

	    Args:
	        matiere_recherchee: Value stored in the ``matiere_recherchee`` column.
	        proxy_choisi: Value stored in the ``proxy_choisi`` column.
	        scenario: Value stored in the ``scenario`` column.
	        impact_kg_co2_t: Impact value, rounded to two decimals before being
	            written; the cell is left blank when ``None``.
	        source_db: Value stored in the ``source_db`` column.
	        match_exact: Written as ``"Oui"`` when truthy, ``"Non"`` otherwise.
	        pays_production: Stored as-is; falsy values become an empty string.
	        pays_transformation: Stored as-is; falsy values become an empty string.
	        type_mp: Value stored in the ``type_mp`` column.
	    """
	    _ensure_file()
	    row = {
	        # Naive local time, ISO-8601 with second precision.
	        "timestamp": datetime.now().isoformat(timespec="seconds"),
	        "matiere_recherchee": matiere_recherchee,
	        "proxy_choisi": proxy_choisi,
	        "scenario": scenario,
	        "impact_kg_co2_t": round(impact_kg_co2_t, 2) if impact_kg_co2_t is not None else "",
	        "source_db": source_db,
	        "match_exact": "Oui" if match_exact else "Non",
	        "pays_production": pays_production or "",
	        "pays_transformation": pays_transformation or "",
	        "type_mp": type_mp,
	    }
	    # newline="" lets the csv module control line endings itself.
	    with open(_LOG_FILE, "a", newline="", encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=_FIELDNAMES)
	        writer.writerow(row)


	# ---------------------------------------------------------------------------
	# Reading / statistics
	# ---------------------------------------------------------------------------

	def load_log() -> pd.DataFrame:
	    """Read the whole selection log into a DataFrame.

	    The log file is created first if it does not exist. When a ``timestamp``
	    column is present it is converted to datetimes; values that cannot be
	    parsed become ``NaT``.
	    """
	    _ensure_file()
	    log_df = pd.read_csv(_LOG_FILE, encoding="utf-8")
	    if "timestamp" not in log_df.columns:
	        return log_df
	    parsed = pd.to_datetime(log_df["timestamp"], errors="coerce")
	    log_df["timestamp"] = parsed
	    return log_df


	def top_proxies(n: int = 20, days: int | None = None) -> pd.DataFrame:
	    """Return the ``n`` most frequently selected proxies.

	    Args:
	        n: Maximum number of rows to return.
	        days: When given, only selections logged during the last ``days``
	            days are counted.

	    Returns:
	        A DataFrame with the columns ``proxy_choisi``, ``nb_selections`` and
	        ``dernière_utilisation`` (latest timestamp per proxy), ordered by
	        decreasing ``nb_selections``. An empty frame with the same columns
	        is returned when there is nothing to count.
	    """
	    columns = ["proxy_choisi", "nb_selections", "dernière_utilisation"]
	    df = load_log()

	    if not df.empty and days is not None and "timestamp" in df.columns:
	        cutoff = pd.Timestamp.now() - pd.Timedelta(days=days)
	        # Rows whose timestamp could not be parsed (NaT) are excluded here.
	        df = df[df["timestamp"] >= cutoff]

	    if df.empty:
	        return pd.DataFrame(columns=columns)

	    return (
	        df.groupby("proxy_choisi", sort=False)
	        .agg(
	            nb_selections=("proxy_choisi", "size"),
	            dernière_utilisation=("timestamp", "max"),
	        )
	        .reset_index()
	        # Stable sort: proxies with equal counts keep their first-seen
	        # order, so the top-n cut is deterministic.
	        .sort_values("nb_selections", ascending=False, kind="mergesort")
	        .head(n)
	    )


	def top_scenarios(n: int = 20, days: int | None = None) -> pd.DataFrame:
	    """Return the ``n`` most frequently logged scenarios.

	    Args:
	        n: Maximum number of rows to return.
	        days: When given, only selections logged during the last ``days``
	            days are counted.

	    Returns:
	        A DataFrame with the columns ``scenario``, ``nb_selections`` and
	        ``dernière_utilisation`` (latest timestamp per scenario), ordered by
	        decreasing ``nb_selections``. An empty frame with the same columns
	        is returned when there is nothing to count.
	    """
	    columns = ["scenario", "nb_selections", "dernière_utilisation"]
	    df = load_log()

	    if not df.empty and days is not None and "timestamp" in df.columns:
	        cutoff = pd.Timestamp.now() - pd.Timedelta(days=days)
	        # Rows whose timestamp could not be parsed (NaT) are excluded here.
	        df = df[df["timestamp"] >= cutoff]

	    if df.empty:
	        return pd.DataFrame(columns=columns)

	    return (
	        df.groupby("scenario", sort=False)
	        .agg(
	            nb_selections=("scenario", "size"),
	            dernière_utilisation=("timestamp", "max"),
	        )
	        .reset_index()
	        # Stable sort: scenarios with equal counts keep their first-seen
	        # order, so the top-n cut is deterministic.
	        .sort_values("nb_selections", ascending=False, kind="mergesort")
	        .head(n)
	    )