fossil_app / fossil_library.py
piperod91's picture
Fossil links: map old prj_fossil_unknown pages to new FossilLeafLens pages/ pattern
5f3d314
import os
import random
import csv
from dataclasses import dataclass
from typing import Literal
# Allowed values for a fossil's selection label.
Selection = Literal[
    "Plausible",
    "Not Sure",
    "Impossible",
    "Unlabeled",
]


@dataclass(frozen=True)
class FossilEntry:
    """Immutable record describing a single fossil in the library."""

    # Identifier of the fossil.
    fossil_id: str
    # Selection label attached to the fossil (one of the Selection values).
    selection: Selection
    # URL of the fossil's image.
    image_url: str
    # URL of the fossil's page.
    page_url: str
# Root URL of the FossilLeafLens site.
FOSSIL_LEAF_LENS_URL = "https://serre-lab.github.io/FossilLeafLens/"

# URL prefix of the legacy prj_fossil_unknown fossil pages.
_OLD_PAGE_PREFIX = "https://serre-lab.github.io/prj_fossil_unknown/pages/unknown/"

# URL prefix of the FossilLeafLens pages/ tree (site root + "pages/").
_NEW_PAGE_PREFIX = FOSSIL_LEAF_LENS_URL + "pages/"

# Base URL under which the inference images are stored.
INFERENCE_IMAGE_BASE_URL = (
    "https://storage.googleapis.com/serrelab/fossil_lens/inference_concepts2/"
)
def load_fossil_entries_from_csv(csv_path: str, *, shuffle: bool = True) -> list[FossilEntry]:
    """
    Load Florissant unidentified fossils from the flattened CSV, excluding Not Applicable entries.

    Expected columns:
    - fossil_id
    - user_selection
    - is_applicable (0/1)
    - image_url
    - page_url

    Row handling:
    - Rows with a blank fossil_id, or with is_applicable equal to "0", are skipped.
    - A user_selection other than "Plausible", "Not Sure", "Impossible" or
      "Unlabeled" is stored as "Unlabeled".
    - A blank (missing, empty or whitespace-only) image_url falls back to
      "<INFERENCE_IMAGE_BASE_URL><fossil_id>/image.jpg".
    - A page_url starting with the old prj_fossil_unknown prefix is rewritten to
      the FossilLeafLens pages/ prefix; a blank page_url falls back to
      FOSSIL_LEAF_LENS_URL.
    - When a fossil_id appears more than once, the first row is kept unless it
      is "Unlabeled", in which case the first labeled row replaces it.

    Args:
        csv_path: Path to the CSV file.
        shuffle: When true (the default), the result list is shuffled in place
            with random.shuffle before being returned.

    Returns:
        One FossilEntry per distinct fossil_id.

    Raises:
        FileNotFoundError: If csv_path does not exist.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Fossil library CSV not found: {csv_path}")

    entries: dict[str, FossilEntry] = {}

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise turn the first header into "\ufefffossil_id" and cause
    # every row to be skipped. newline="" lets the csv module handle line
    # endings itself, as its documentation recommends.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # `or ""` guards against None, which DictReader yields for short rows.
            fossil_id = (row.get("fossil_id") or "").strip()
            if not fossil_id:
                continue

            is_applicable = (row.get("is_applicable") or "").strip()
            if is_applicable == "0":
                continue  # Exclude Not Applicable

            selection_raw = (row.get("user_selection") or "").strip()
            selection: Selection
            if selection_raw in ("Plausible", "Not Sure", "Impossible", "Unlabeled"):
                selection = selection_raw  # type: ignore[assignment]
            else:
                selection = "Unlabeled"

            # Strip BEFORE applying the fallback so that a whitespace-only cell
            # gets the default image URL instead of an empty string.
            image_url = (row.get("image_url") or "").strip()
            if not image_url:
                image_url = f"{INFERENCE_IMAGE_BASE_URL}{fossil_id}/image.jpg"

            # Map old prj_fossil_unknown pages to the new FossilLeafLens pages pattern.
            raw_page_url = (row.get("page_url") or "").strip()
            if raw_page_url.startswith(_OLD_PAGE_PREFIX):
                page_url = raw_page_url.replace(_OLD_PAGE_PREFIX, _NEW_PAGE_PREFIX, 1)
            else:
                page_url = raw_page_url or FOSSIL_LEAF_LENS_URL

            # Keep the first row per fossil, but let a labeled row supersede an
            # earlier "Unlabeled" one.
            existing = entries.get(fossil_id)
            if existing is None or (existing.selection == "Unlabeled" and selection != "Unlabeled"):
                entries[fossil_id] = FossilEntry(
                    fossil_id=fossil_id,
                    selection=selection,
                    image_url=image_url,
                    page_url=page_url,
                )

    result = list(entries.values())
    if shuffle:
        random.shuffle(result)
    return result