# ArtenTracker / src/data_cache.py
# Johannes
# Initial deployment (no data - downloaded from HF Dataset at startup)
# 0d4a0ba
"""Disk-based Parquet cache for historical GBIF occurrences (YEAR_FROM … HIST_YEAR_TO).
Cache files live in data/species_cache/ and are named:
{taxon_key}_{scope}.parquet
where scope is a country code (e.g. "DE") or "worldwide".
Fallback rule: if no country-specific cache exists but a "worldwide" cache does,
load_historical() returns the worldwide data filtered to the requested country in-memory.
"""
import pandas as pd
from pathlib import Path
from datetime import datetime
from src.utils import CACHE_DIR, YEAR_FROM, HIST_YEAR_TO
def _scope_str(country: str | None) -> str:
return country if country else "worldwide"
def cache_path(taxon_key: int, country: str | None) -> Path:
    """Build the cache file path for ``taxon_key`` and ``country``.

    The file is named ``{taxon_key}_{scope}.parquet`` inside ``CACHE_DIR``,
    where the scope comes from ``_scope_str(country)``.

    Side effect: ``CACHE_DIR`` (including parents) is created if it does not
    exist yet, so the returned path can be written to immediately.
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    file_name = f"{taxon_key}_{_scope_str(country)}.parquet"
    return CACHE_DIR / file_name
def cache_exists(taxon_key: int, country: str | None) -> bool:
    """Tell whether cached data is available for ``taxon_key`` / ``country``.

    Returns ``True`` when the cache file for the exact scope exists, or when
    a country was requested and the worldwide cache file exists (the
    worldwide cache can serve any country filter).
    """
    if cache_path(taxon_key, country).exists():
        return True
    # Only a country request can fall back to the worldwide file; the
    # second path is evaluated lazily, exactly when the first one is missing.
    return country is not None and cache_path(taxon_key, None).exists()
def cache_info(taxon_key: int, country: str | None) -> dict | None:
    """Return metadata about the cache file serving ``taxon_key`` / ``country``.

    If the country-specific file is missing and ``country`` is not ``None``,
    the worldwide cache file is inspected instead.

    Returns:
        ``None`` when no usable cache file exists, otherwise a dict with:

        * ``"size_mb"``: file size in bytes divided by 1_048_576.
        * ``"rows"``: number of rows, obtained by reading only the ``"year"``
          column; ``-1`` if that read fails for any reason.
        * ``"modified"``: ``datetime`` built from the file's mtime.
        * ``"path"``: the ``Path`` that was inspected.
        * ``"worldwide_fallback"``: ``True`` if the worldwide file was used
          in place of a missing country-specific file.
    """
    p = cache_path(taxon_key, country)
    using_fallback = False
    if not p.exists() and country is not None:
        p = cache_path(taxon_key, None)
        using_fallback = True

    try:
        # A single stat() call: size and mtime then describe the same file
        # state, and a file removed since the exists() check is reported as
        # "no cache" instead of raising.
        file_stat = p.stat()
    except FileNotFoundError:
        return None

    try:
        # Reading one column is enough to count rows without loading the
        # whole table.
        rows = len(pd.read_parquet(p, columns=["year"]))
    except Exception:
        # Deliberately best-effort: an unreadable file or a missing "year"
        # column only makes the row count unknown.
        rows = -1

    return {
        "size_mb": file_stat.st_size / 1_048_576,
        "rows": rows,
        "modified": datetime.fromtimestamp(file_stat.st_mtime),
        "path": p,
        "worldwide_fallback": using_fallback,
    }
def load_historical(taxon_key: int, country: str | None) -> pd.DataFrame:
    """Load cached occurrences for ``taxon_key`` restricted to ``country``.

    Lookup order:

    1. The cache file for the exact scope, returned as stored.
    2. If a country was requested: the worldwide cache file, filtered to rows
       whose ``"countryCode"`` equals ``country``. If the worldwide data has
       no ``"countryCode"`` column it is returned unfiltered.
    3. Otherwise an empty ``DataFrame``.
    """
    own_file = cache_path(taxon_key, country)
    if own_file.exists():
        return pd.read_parquet(own_file)

    # A worldwide request has nothing to fall back to.
    if country is None:
        return pd.DataFrame()

    world_file = cache_path(taxon_key, None)
    if not world_file.exists():
        return pd.DataFrame()

    world = pd.read_parquet(world_file)
    if "countryCode" not in world.columns:
        return world
    in_country = world["countryCode"] == country
    return world[in_country].copy()
def save_historical(taxon_key: int, country: str | None, df: pd.DataFrame) -> Path:
    """Write ``df`` to the cache file for ``taxon_key`` / ``country``.

    The frame is written without its index. It goes to a temporary sibling
    file first and is then renamed over the final path, so an interrupted
    write never leaves a truncated ``.parquet`` file that later looks like a
    valid cache entry.

    Returns:
        The path of the cache file that was written.

    Raises:
        Whatever ``DataFrame.to_parquet`` or the rename raises; in that case
        the temporary file is removed and any previous cache file is left
        untouched.
    """
    p = cache_path(taxon_key, country)
    tmp = p.with_name(f"{p.name}.tmp")
    try:
        df.to_parquet(tmp, index=False)
        # Same-directory rename: replaces an existing cache file in one step.
        tmp.replace(p)
    finally:
        # No-op after a successful rename; cleans up after a failed write.
        tmp.unlink(missing_ok=True)
    return p
def delete_cache(taxon_key: int, country: str | None) -> bool:
    """Remove the cache file for exactly this ``taxon_key`` / ``country`` scope.

    Only the file for the given scope is considered; a worldwide file is not
    touched when a country is passed.

    Returns:
        ``True`` if a file existed and was deleted, ``False`` otherwise.
    """
    target = cache_path(taxon_key, country)
    was_present = target.exists()
    if was_present:
        target.unlink()
    return was_present
def historical_year_range() -> tuple[int, int]:
    """Return the configured ``(YEAR_FROM, HIST_YEAR_TO)`` pair."""
    first_year, last_year = YEAR_FROM, HIST_YEAR_TO
    return first_year, last_year