GAIA26CCPA / src /proxy_log.py
JosephMcDonnell's picture
Modifs J2 (#9)
13bb974
"""
proxy_log.py – Persistence layer for proxy/scenario selections.
Each time the engine returns a result (single search or formulation row),
the chosen proxy is logged. The CSV file lives in the ``data/`` directory
next to this module (``data/proxy_selections.csv``) so it survives
Streamlit restarts.
CSV columns:
timestamp, matiere_recherchee, proxy_choisi, scenario, impact_kg_co2_t,
source_db, match_exact, pays_production, pays_transformation, type_mp
"""
from __future__ import annotations
import csv
import os
from datetime import datetime
from pathlib import Path
from collections import Counter
import pandas as pd
# ---------------------------------------------------------------------------
# Fichier de stockage
# ---------------------------------------------------------------------------
# The log is kept in a "data" directory located beside this module.
_LOG_DIR = Path(__file__).parent.joinpath("data")
_LOG_FILE = _LOG_DIR.joinpath("proxy_selections.csv")

# Column order of the CSV file; used both for the header and for every row.
_FIELDNAMES = [
    # When the selection was logged and what was searched for.
    "timestamp",
    "matiere_recherchee",
    # What the engine returned.
    "proxy_choisi",
    "scenario",
    "impact_kg_co2_t",
    "source_db",
    "match_exact",
    # Context of the search.
    "pays_production",
    "pays_transformation",
    "type_mp",
]
def _ensure_file() -> None:
    """Create the log directory and a header-only CSV file when needed.

    The header row is written when the file is missing *or* is zero bytes
    long. An empty file can be left behind by an interrupted first write or
    by an external ``touch``; appending rows to it would produce a CSV
    without a header, and ``pandas.read_csv`` raises ``EmptyDataError`` on a
    completely empty file. A file that already has content is never modified.
    """
    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    if _LOG_FILE.exists() and _LOG_FILE.stat().st_size > 0:
        # Already initialised: leave existing content untouched.
        return
    with open(_LOG_FILE, "w", newline="", encoding="utf-8") as f:
        csv.DictWriter(f, fieldnames=_FIELDNAMES).writeheader()
# ---------------------------------------------------------------------------
# Écriture
# ---------------------------------------------------------------------------
def log_selection(
    matiere_recherchee: str,
    proxy_choisi: str,
    scenario: str,
    impact_kg_co2_t: float | None = None,
    source_db: str = "",
    match_exact: bool = True,
    pays_production: str = "",
    pays_transformation: str = "",
    type_mp: str = "vegetal_animal",
) -> None:
    """Append one proxy selection to the CSV log.

    The log file is created (with its header) beforehand if necessary. The
    row is stamped with the current local time at one-second resolution.

    Args:
        matiere_recherchee: Material name that was searched for.
        proxy_choisi: Proxy that was selected for that material.
        scenario: Scenario associated with the selection.
        impact_kg_co2_t: Impact value, stored rounded to two decimals; an
            empty cell is written when it is ``None``.
        source_db: Name of the source database.
        match_exact: Stored as ``"Oui"`` when truthy, ``"Non"`` otherwise.
        pays_production: Production country; falsy values become ``""``.
        pays_transformation: Transformation country; falsy values become ``""``.
        type_mp: Raw-material type label.
    """
    _ensure_file()

    if impact_kg_co2_t is None:
        impact_cell = ""
    else:
        impact_cell = round(impact_kg_co2_t, 2)

    exact_label = "Oui" if match_exact else "Non"

    record = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "matiere_recherchee": matiere_recherchee,
        "proxy_choisi": proxy_choisi,
        "scenario": scenario,
        "impact_kg_co2_t": impact_cell,
        "source_db": source_db,
        "match_exact": exact_label,
        "pays_production": pays_production if pays_production else "",
        "pays_transformation": pays_transformation if pays_transformation else "",
        "type_mp": type_mp,
    }

    # Append mode: existing rows are preserved, the new one goes at the end.
    with open(_LOG_FILE, "a", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=_FIELDNAMES).writerow(record)
# ---------------------------------------------------------------------------
# Lecture / statistiques
# ---------------------------------------------------------------------------
def load_log() -> pd.DataFrame:
    """Read the whole selection log into a DataFrame.

    The log file is created first if it does not exist. When a
    ``timestamp`` column is present it is converted to datetimes; values
    that cannot be parsed become ``NaT``.
    """
    _ensure_file()
    journal = pd.read_csv(_LOG_FILE, encoding="utf-8")
    if "timestamp" not in journal.columns:
        return journal
    journal["timestamp"] = pd.to_datetime(journal["timestamp"], errors="coerce")
    return journal
def _top_by_column(column: str, n: int, days: int | None) -> pd.DataFrame:
    """Rank the distinct values of *column* by how often they were logged.

    Shared implementation of :func:`top_proxies` and :func:`top_scenarios`.

    Args:
        column: Name of the log column to group by.
        n: Maximum number of rows to return.
        days: When not ``None``, only rows whose timestamp falls within the
            last *days* days are counted.

    Returns:
        A DataFrame with the columns *column*, ``nb_selections`` and
        ``dernière_utilisation`` (latest timestamp of the group), ordered by
        decreasing ``nb_selections``. It has those columns and no rows when
        the log, or the selected time window, is empty.
    """
    result_columns = [column, "nb_selections", "dernière_utilisation"]

    df = load_log()
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    if days is not None and "timestamp" in df.columns:
        cutoff = pd.Timestamp.now() - pd.Timedelta(days=days)
        # Rows with an unparseable timestamp (NaT) compare False and are dropped.
        df = df[df["timestamp"] >= cutoff]
        if df.empty:
            return pd.DataFrame(columns=result_columns)

    return (
        df.groupby(column, sort=False)
        .agg(
            nb_selections=(column, "size"),
            dernière_utilisation=("timestamp", "max"),
        )
        .reset_index()
        # A stable sort keeps ties in the order produced by
        # groupby(sort=False), i.e. first appearance in the log, instead of
        # the unspecified order of the default quicksort.
        .sort_values("nb_selections", ascending=False, kind="mergesort")
        .head(n)
    )


def top_proxies(n: int = 20, days: int | None = None) -> pd.DataFrame:
    """Return the *n* most frequently chosen proxies.

    Args:
        n: Maximum number of rows to return.
        days: Optional window; when given, only the last *days* days count.

    Returns:
        A DataFrame with the columns ``proxy_choisi``, ``nb_selections`` and
        ``dernière_utilisation``, most selected first.
    """
    return _top_by_column("proxy_choisi", n, days)


def top_scenarios(n: int = 20, days: int | None = None) -> pd.DataFrame:
    """Return the *n* most frequent scenarios.

    Args:
        n: Maximum number of rows to return.
        days: Optional window; when given, only the last *days* days count.

    Returns:
        A DataFrame with the columns ``scenario``, ``nb_selections`` and
        ``dernière_utilisation``, most frequent first.
    """
    return _top_by_column("scenario", n, days)