Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| import os | |
| import random | |
| import csv | |
| from dataclasses import dataclass | |
| from typing import Literal | |
# Label values accepted from the CSV "user_selection" column; the loader
# falls back to "Unlabeled" for anything else.
Selection = Literal["Plausible", "Not Sure", "Impossible", "Unlabeled"]


@dataclass
class FossilEntry:
    """One fossil record built from a row of the fossil library CSV.

    Declared as a dataclass so instances can be constructed with keyword
    arguments (``FossilEntry(fossil_id=..., selection=..., ...)``) and get
    field-wise ``__eq__`` / ``__repr__`` for free.
    """

    fossil_id: str  # identifier taken from the "fossil_id" column
    selection: Selection  # normalized label from the "user_selection" column
    image_url: str  # URL of the fossil image
    page_url: str  # URL of the fossil's detail page
# Landing page of the FossilLeafLens site; used as the page URL when a CSV
# row does not provide one.
FOSSIL_LEAF_LENS_URL = "https://serre-lab.github.io/FossilLeafLens/"

# Page URLs that start with the old prefix are rewritten to the new prefix.
_OLD_PAGE_PREFIX = "https://serre-lab.github.io/prj_fossil_unknown/pages/unknown/"
_NEW_PAGE_PREFIX = "https://serre-lab.github.io/FossilLeafLens/pages/"

# Base for default image URLs; "<fossil_id>/image.jpg" is appended directly,
# so the trailing slash is significant.
INFERENCE_IMAGE_BASE_URL = "https://storage.googleapis.com/serrelab/fossil_lens/inference_concepts2/"
def load_fossil_entries_from_csv(csv_path: str, *, shuffle: bool = True) -> list[FossilEntry]:
    """
    Load Florissant unidentified fossils from the flattened CSV, excluding Not Applicable entries.

    Expected columns:
    - fossil_id: rows with a blank or missing id are skipped.
    - user_selection: "Plausible", "Not Sure", "Impossible" or "Unlabeled";
      any other value (including blank) is stored as "Unlabeled".
    - is_applicable (0/1): rows whose value is exactly "0" are skipped.
    - image_url: when blank, defaults to
      ``{INFERENCE_IMAGE_BASE_URL}{fossil_id}/image.jpg``.
    - page_url: URLs starting with the old prj_fossil_unknown prefix are
      rewritten to the FossilLeafLens pages prefix; a blank value falls
      back to FOSSIL_LEAF_LENS_URL.

    When a fossil_id occurs in several rows, the first row is kept unless it
    is "Unlabeled" and a later row carries a real label, in which case the
    later row replaces it (keeping the original position in the ordering).

    Args:
        csv_path: Path to the CSV file.
        shuffle: If true (the default), the result is shuffled in place with
            ``random.shuffle``; otherwise entries come back in order of first
            appearance of each fossil_id.

    Returns:
        One FossilEntry per distinct, applicable fossil_id.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Fossil library CSV not found: {csv_path}")

    valid_selections = ("Plausible", "Not Sure", "Impossible", "Unlabeled")
    entries: dict[str, FossilEntry] = {}

    # newline="" hands line-ending handling to the csv module, as its docs
    # require (quoted fields may contain newlines). "utf-8-sig" drops a
    # leading BOM if present, so the first header still reads "fossil_id";
    # files without a BOM decode exactly as with plain UTF-8.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            fossil_id = (row.get("fossil_id") or "").strip()
            if not fossil_id:
                continue

            if (row.get("is_applicable") or "").strip() == "0":
                continue  # Exclude Not Applicable

            selection_raw = (row.get("user_selection") or "").strip()
            selection: Selection
            if selection_raw in valid_selections:
                selection = selection_raw  # type: ignore[assignment]
            else:
                selection = "Unlabeled"

            # Strip before applying the fallback so a whitespace-only cell
            # gets the default image URL instead of an empty string.
            image_url = (
                (row.get("image_url") or "").strip()
                or f"{INFERENCE_IMAGE_BASE_URL}{fossil_id}/image.jpg"
            )

            # Map old prj_fossil_unknown pages to the new FossilLeafLens pages pattern.
            raw_page_url = (row.get("page_url") or "").strip()
            if raw_page_url.startswith(_OLD_PAGE_PREFIX):
                page_url = raw_page_url.replace(_OLD_PAGE_PREFIX, _NEW_PAGE_PREFIX, 1)
            else:
                page_url = raw_page_url or FOSSIL_LEAF_LENS_URL

            # Keep the first row per fossil, but let a labeled row replace
            # an earlier "Unlabeled" one.
            existing = entries.get(fossil_id)
            if existing is None or (existing.selection == "Unlabeled" and selection != "Unlabeled"):
                entries[fossil_id] = FossilEntry(
                    fossil_id=fossil_id,
                    selection=selection,
                    image_url=image_url,
                    page_url=page_url,
                )

    result = list(entries.values())
    if shuffle:
        random.shuffle(result)
    return result