Spaces:
Running
Running
| """Disk-based Parquet cache for historical GBIF occurrences (YEAR_FROM … HIST_YEAR_TO). | |
| Cache files live in data/species_cache/ and are named: | |
| {taxon_key}_{scope}.parquet | |
| where scope is a country code (e.g. "DE") or "worldwide". | |
| Fallback rule: if no country-specific cache exists but a "worldwide" cache does, | |
| load_historical() returns the worldwide data filtered to the requested country in-memory. | |
| """ | |
| import pandas as pd | |
| from pathlib import Path | |
| from datetime import datetime | |
| from src.utils import CACHE_DIR, YEAR_FROM, HIST_YEAR_TO | |
| def _scope_str(country: str | None) -> str: | |
| return country if country else "worldwide" | |
def cache_path(taxon_key: int, country: str | None) -> Path:
    """Build the Parquet cache path for a taxon key and scope.

    The file is named ``{taxon_key}_{scope}.parquet`` inside ``CACHE_DIR``,
    where the scope is ``country`` or ``"worldwide"`` when ``country`` is falsy.

    Side effect: ``CACHE_DIR`` (and any missing parents) is created on every
    call, so the returned path can be written to immediately. The cache file
    itself is not created or checked here.
    """
    scope = _scope_str(country)
    # Make sure the directory exists before handing the path to callers.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    filename = f"{taxon_key}_{scope}.parquet"
    return CACHE_DIR / filename
def cache_exists(taxon_key: int, country: str | None) -> bool:
    """Tell whether cached data is available for the taxon and scope.

    Returns ``True`` when the cache file for exactly this scope exists. When a
    country was requested (``country`` is not ``None``), a worldwide cache file
    also counts, because it can serve any country filter.
    """
    # Candidate scopes in lookup order; the worldwide file is only a second
    # candidate when a country was actually asked for.
    scopes = (country,) if country is None else (country, None)
    # any() stops at the first existing file, like the original early return.
    return any(cache_path(taxon_key, scope).exists() for scope in scopes)
def cache_info(taxon_key: int, country: str | None) -> dict | None:
    """Describe the cache file that would serve this taxon and scope.

    The scope-specific file is preferred; when it is missing and a country was
    requested (``country`` is not ``None``), the worldwide file is used instead.

    Returns:
        ``None`` when neither file exists, otherwise a dict with the keys

        - ``"size_mb"``: file size in MiB (bytes / 1_048_576),
        - ``"rows"``: number of rows in the file, or ``-1`` when the row count
          could not be read,
        - ``"modified"``: file modification time as a naive local ``datetime``,
        - ``"path"``: the ``Path`` that was inspected,
        - ``"worldwide_fallback"``: ``True`` when the worldwide file stands in
          for a missing country-specific one.

    Note:
        Under worldwide fallback ``"rows"`` counts the whole worldwide file,
        not only the rows belonging to ``country``.
    """
    scopes: list[tuple[str | None, bool]] = [(country, False)]
    if country is not None:
        scopes.append((None, True))

    for scope, using_fallback in scopes:
        path = cache_path(taxon_key, scope)
        try:
            # One stat() call gives a consistent size/mtime snapshot and
            # doubles as the existence check, so a file that disappears
            # concurrently is treated as missing instead of raising.
            st = path.stat()
        except FileNotFoundError:
            continue

        try:
            # Reading a single column keeps the row count cheap.
            rows = len(pd.read_parquet(path, columns=["year"]))
        except Exception:
            # Deliberate best effort: an unreadable file or a missing "year"
            # column must not prevent size/date from being reported.
            rows = -1

        return {
            "size_mb": st.st_size / 1_048_576,
            "rows": rows,
            "modified": datetime.fromtimestamp(st.st_mtime),
            "path": path,
            "worldwide_fallback": using_fallback,
        }

    return None
def load_historical(taxon_key: int, country: str | None) -> pd.DataFrame:
    """Load cached occurrences for a taxon, with worldwide fallback.

    Lookup order:

    1. The cache file for exactly this scope, returned as stored.
    2. If a country was requested and only the worldwide file exists, that
       file restricted in memory to rows whose ``countryCode`` equals
       ``country``.
    3. Otherwise an empty ``DataFrame``.

    Note:
        If the worldwide file has no ``countryCode`` column it is returned
        unfiltered.
    """
    exact = cache_path(taxon_key, country)
    if exact.exists():
        return pd.read_parquet(exact)

    # A worldwide request has nothing left to fall back to.
    if country is None:
        return pd.DataFrame()

    world = cache_path(taxon_key, None)
    if not world.exists():
        return pd.DataFrame()

    frame = pd.read_parquet(world)
    if "countryCode" not in frame.columns:
        # Nothing to filter on: hand back the worldwide data unchanged.
        return frame

    in_country = frame["countryCode"] == country
    # copy() detaches the result from the full worldwide frame.
    return frame[in_country].copy()
def save_historical(taxon_key: int, country: str | None, df: pd.DataFrame) -> Path:
    """Write ``df`` to the cache file for this taxon and scope.

    The frame is stored as Parquet without its index. The data is first written
    to a temporary sibling file (``<name>.parquet.tmp``) and then renamed over
    the final path, so an interrupted write never leaves a truncated cache file
    that later existence checks would treat as valid. An existing cache file is
    replaced.

    Returns:
        The path of the cache file.

    Raises:
        Whatever ``DataFrame.to_parquet`` or the rename raises; in that case
        the temporary file is removed and any previous cache file is left as
        it was.
    """
    target = cache_path(taxon_key, country)
    # Same directory as the target, so the rename stays on one filesystem.
    tmp = target.with_name(target.name + ".tmp")
    try:
        df.to_parquet(tmp, index=False)
        tmp.replace(target)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp.unlink(missing_ok=True)
    return target
def delete_cache(taxon_key: int, country: str | None) -> bool:
    """Delete the cache file for exactly this taxon and scope.

    Only the file for the given scope is removed; a worldwide cache that might
    serve as a fallback for ``country`` is left untouched.

    Returns:
        ``True`` if a file was removed, ``False`` if there was none.
    """
    try:
        # Attempt the removal directly instead of checking exists() first, so
        # a file removed concurrently yields False rather than an exception.
        cache_path(taxon_key, country).unlink()
    except FileNotFoundError:
        return False
    return True
def historical_year_range() -> tuple[int, int]:
    """Return the ``(YEAR_FROM, HIST_YEAR_TO)`` pair imported from ``src.utils``."""
    first_year, last_year = YEAR_FROM, HIST_YEAR_TO
    return first_year, last_year