Spaces:

CCPA-GAIA
/

GAIA26CCPA

Sleeping

File size: 4,940 Bytes

13bb974

"""
proxy_log.py – Persistence layer for proxy/scenario selections.

Each time the engine returns a result (single search or formulation row),
the chosen proxy is logged.  The CSV file lives next to app.py so it
survives Streamlit restarts.

CSV columns:
    timestamp, matiere_recherchee, proxy_choisi, scenario, impact_kg_co2_t,
    source_db, match_exact, pays_production, pays_transformation, type_mp
"""
from __future__ import annotations

import csv
import os
from datetime import datetime
from pathlib import Path
from collections import Counter

import pandas as pd

# ---------------------------------------------------------------------------
# Storage file
# ---------------------------------------------------------------------------
# The log lives in a "data" directory placed next to this module.
_LOG_DIR = Path(__file__).parent.joinpath("data")
_LOG_FILE = _LOG_DIR.joinpath("proxy_selections.csv")

# Column order of the CSV file; it is used both for the header row and for
# every data row written by the csv.DictWriter instances in this module.
_FIELDNAMES = [
    # what was searched and what was selected
    "timestamp", "matiere_recherchee", "proxy_choisi", "scenario",
    # result details
    "impact_kg_co2_t", "source_db", "match_exact",
    # search context
    "pays_production", "pays_transformation", "type_mp",
]


def _ensure_file() -> None:
    """Create the log directory and the CSV file (with its header) if needed.

    A zero-byte file is treated like a missing one: without a header row,
    appended records would be misread as the header and ``pandas.read_csv``
    raises ``EmptyDataError`` on a completely empty file.
    """
    _LOG_DIR.mkdir(parents=True, exist_ok=True)
    try:
        needs_header = _LOG_FILE.stat().st_size == 0
    except FileNotFoundError:
        needs_header = True

    if needs_header:
        with open(_LOG_FILE, "w", newline="", encoding="utf-8") as f:
            csv.DictWriter(f, fieldnames=_FIELDNAMES).writeheader()


# ---------------------------------------------------------------------------
# Writing
# ---------------------------------------------------------------------------

def log_selection(
    matiere_recherchee: str,
    proxy_choisi: str,
    scenario: str,
    impact_kg_co2_t: float | None = None,
    source_db: str = "",
    match_exact: bool = True,
    pays_production: str = "",
    pays_transformation: str = "",
    type_mp: str = "vegetal_animal",
) -> None:
    """Append one proxy selection to the CSV log.

    The log file is created (with its header) beforehand if necessary.
    The row is stamped with the current local time at one-second
    resolution. ``impact_kg_co2_t`` is rounded to two decimals, or written
    as an empty cell when it is ``None``; ``match_exact`` is stored as
    ``"Oui"`` / ``"Non"``; falsy country values are stored as empty strings.
    """
    _ensure_file()

    stamp = datetime.now().isoformat(timespec="seconds")
    if impact_kg_co2_t is None:
        impact_cell = ""
    else:
        impact_cell = round(impact_kg_co2_t, 2)
    exact_label = "Oui" if match_exact else "Non"

    record = dict(
        timestamp=stamp,
        matiere_recherchee=matiere_recherchee,
        proxy_choisi=proxy_choisi,
        scenario=scenario,
        impact_kg_co2_t=impact_cell,
        source_db=source_db,
        match_exact=exact_label,
        pays_production=pays_production or "",
        pays_transformation=pays_transformation or "",
        type_mp=type_mp,
    )

    with open(_LOG_FILE, "a", newline="", encoding="utf-8") as handle:
        csv.DictWriter(handle, fieldnames=_FIELDNAMES).writerow(record)


# ---------------------------------------------------------------------------
# Reading / statistics
# ---------------------------------------------------------------------------

def load_log() -> pd.DataFrame:
    """Return the complete selection log as a DataFrame.

    The log file is created first if it does not exist. When a
    ``timestamp`` column is present it is converted to datetimes, with
    unparseable values coerced to ``NaT``.
    """
    _ensure_file()
    log_df = pd.read_csv(_LOG_FILE, encoding="utf-8")
    if "timestamp" not in log_df.columns:
        return log_df
    log_df["timestamp"] = pd.to_datetime(log_df["timestamp"], errors="coerce")
    return log_df


def _top_by_column(column: str, n: int, days: int | None) -> pd.DataFrame:
    """Count log rows per distinct value of *column* and keep the top *n*.

    Shared implementation behind ``top_proxies`` and ``top_scenarios``.

    Args:
        column: Name of the log column to group by.
        n: Maximum number of rows to return.
        days: When given, only rows whose timestamp falls within the last
            *days* days are counted.

    Returns:
        A DataFrame with the columns *column*, ``nb_selections`` and
        ``dernière_utilisation`` (latest timestamp seen for the value),
        ordered by decreasing ``nb_selections``. It is empty when the log
        has no matching rows or does not contain *column*.
    """
    result_columns = [column, "nb_selections", "dernière_utilisation"]

    df = load_log()
    if df.empty or column not in df.columns:
        return pd.DataFrame(columns=result_columns)

    if "timestamp" not in df.columns:
        # Without timestamps no date filtering is possible; supply an
        # all-NaT column so the aggregation below still succeeds instead of
        # raising KeyError.
        df = df.assign(timestamp=pd.NaT)
    elif days is not None:
        cutoff = pd.Timestamp.now() - pd.Timedelta(days=days)
        # Rows whose timestamp could not be parsed (NaT) compare False and
        # are therefore excluded from a date-restricted ranking.
        df = df[df["timestamp"] >= cutoff]
        if df.empty:
            return pd.DataFrame(columns=result_columns)

    stats = (
        df.groupby(column, sort=False)
        .agg(
            nb_selections=(column, "size"),
            dernière_utilisation=("timestamp", "max"),
        )
        .reset_index()
        # A stable sort keeps tied values in order of first appearance in
        # the log, so the ranking is deterministic.
        .sort_values("nb_selections", ascending=False, kind="mergesort")
        .head(n)
    )
    return stats


def top_proxies(n: int = 20, days: int | None = None) -> pd.DataFrame:
    """Return the *n* most frequently chosen proxies.

    Args:
        n: Maximum number of proxies to return.
        days: Optional look-back window; when given, only selections from
            the last *days* days are counted.

    Returns:
        A DataFrame with the columns ``proxy_choisi``, ``nb_selections``
        and ``dernière_utilisation``.
    """
    return _top_by_column("proxy_choisi", n, days)


def top_scenarios(n: int = 20, days: int | None = None) -> pd.DataFrame:
    """Return the *n* most frequent scenarios.

    Args:
        n: Maximum number of scenarios to return.
        days: Optional look-back window; when given, only selections from
            the last *days* days are counted.

    Returns:
        A DataFrame with the columns ``scenario``, ``nb_selections`` and
        ``dernière_utilisation``.
    """
    return _top_by_column("scenario", n, days)