"""Disk-based Parquet cache for historical GBIF occurrences (YEAR_FROM … HIST_YEAR_TO).

Cache files live in data/species_cache/ and are named:
    {taxon_key}_{scope}.parquet
where scope is a country code (e.g. "DE") or "worldwide".

Fallback rule: if no country-specific cache exists but a "worldwide" cache
does, load_historical() returns the worldwide data filtered to the requested
country in-memory.
"""

import pandas as pd
from pathlib import Path
from datetime import datetime

from src.utils import CACHE_DIR, YEAR_FROM, HIST_YEAR_TO


def _scope_str(country: str | None) -> str:
    """Return the scope part of a cache file name.

    A falsy ``country`` (``None`` or an empty string) maps to ``"worldwide"``;
    anything else is used verbatim.
    """
    return country if country else "worldwide"


def cache_path(taxon_key: int, country: str | None) -> Path:
    """Return the cache file path for ``taxon_key`` and ``country``.

    Side effect: creates ``CACHE_DIR`` (including parents) if it does not
    exist yet, so the returned path can be written to immediately. The file
    itself is neither created nor checked for existence.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    return CACHE_DIR / f"{taxon_key}_{_scope_str(country)}.parquet"


def cache_exists(taxon_key: int, country: str | None) -> bool:
    """Return True if cached data can be served for the requested scope.

    A country request is also satisfied by a worldwide cache file, because
    load_historical() can filter the worldwide data down to that country.
    """
    if cache_path(taxon_key, country).exists():
        return True
    # worldwide cache can serve any country filter
    if country is not None:
        return cache_path(taxon_key, None).exists()
    return False


def cache_info(taxon_key: int, country: str | None) -> dict | None:
    """Return size (MB), row count, modification date, and whether worldwide fallback is used.

    Returns ``None`` when neither the requested cache file nor (for a country
    request) the worldwide cache file exists. Otherwise returns a dict with:

    - ``"size_mb"``: file size in MiB (bytes / 1_048_576).
    - ``"rows"``: number of rows in the file, or ``-1`` if the file could not
      be read. With the worldwide fallback this is the row count of the whole
      worldwide file, not of the country subset.
    - ``"modified"``: file modification time as a naive local ``datetime``.
    - ``"path"``: the ``Path`` of the file that was inspected.
    - ``"worldwide_fallback"``: True if the worldwide file stands in for a
      missing country-specific file.
    """
    p = cache_path(taxon_key, country)
    using_fallback = False
    if not p.exists() and country is not None:
        p = cache_path(taxon_key, None)
        using_fallback = True
    if not p.exists():
        return None

    # One stat() call so size and mtime describe the same file state.
    st = p.stat()
    size_mb = st.st_size / 1_048_576
    mtime = datetime.fromtimestamp(st.st_mtime)

    try:
        # Read a single column only: enough to count rows without loading
        # the whole table.
        df_meta = pd.read_parquet(p, columns=["year"])
        rows = len(df_meta)
    except Exception:
        # Best effort: an unreadable file or a missing "year" column must not
        # break the info display; -1 signals "row count unknown".
        rows = -1

    return {
        "size_mb": size_mb,
        "rows": rows,
        "modified": mtime,
        "path": p,
        "worldwide_fallback": using_fallback,
    }


def load_historical(taxon_key: int, country: str | None) -> pd.DataFrame:
    """Load cached data.

    If no country-specific cache, falls back to worldwide + in-memory filter.

    Returns an empty DataFrame when no usable cache file exists. If the
    worldwide file has no ``countryCode`` column, the filter cannot be
    applied and the worldwide data is returned unfiltered.
    """
    p = cache_path(taxon_key, country)
    if p.exists():
        return pd.read_parquet(p)

    # Worldwide fallback
    if country is not None:
        p_world = cache_path(taxon_key, None)
        if p_world.exists():
            df = pd.read_parquet(p_world)
            if "countryCode" in df.columns:
                # .copy() so callers get an independent frame, not a slice
                # of the worldwide one.
                return df[df["countryCode"] == country].copy()
            return df

    return pd.DataFrame()


def save_historical(taxon_key: int, country: str | None, df: pd.DataFrame) -> Path:
    """Write ``df`` to the cache file for the given scope and return its path.

    The data is written to a sibling temporary file first and then moved
    over the final path, so an interrupted write does not leave a truncated
    Parquet file that cache_exists() would report as a valid cache. The
    DataFrame index is not stored.
    """
    p = cache_path(taxon_key, country)
    tmp = p.with_name(p.name + ".tmp")
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(p)
    finally:
        # No-op after a successful replace; removes the partial file if the
        # write or the move failed.
        tmp.unlink(missing_ok=True)
    return p


def delete_cache(taxon_key: int, country: str | None) -> bool:
    """Delete the cache file for exactly the given scope.

    Only the file for the requested scope is removed; a worldwide file that
    may serve as fallback for a country is left untouched.

    Returns True if a file was deleted, False if there was none.
    """
    p = cache_path(taxon_key, country)
    try:
        p.unlink()
    except FileNotFoundError:
        return False
    return True


def historical_year_range() -> tuple[int, int]:
    """Return the ``(YEAR_FROM, HIST_YEAR_TO)`` pair configured in src.utils."""
    return YEAR_FROM, HIST_YEAR_TO