Spaces:

Serrelab
/

fossil_app

Running on CPU Upgrade

App Files Files Community

fossil_app / fossil_library.py

piperod91

Fossil links: map old prj_fossil_unknown pages to new FossilLeafLens pages/ pattern

5f3d314 about 2 months ago

raw

history blame contribute delete

2.79 kB

	import os
	import random
	import csv
	from dataclasses import dataclass
	from typing import Literal


	# Closed set of label values an entry's `selection` field may carry.
	Selection = Literal[
	    "Plausible",
	    "Not Sure",
	    "Impossible",
	    "Unlabeled",
	]


	@dataclass(frozen=True)
	class FossilEntry:
	    """Immutable record describing one fossil in the library.

	    Instances are frozen: fields cannot be reassigned after construction,
	    and the generated equality/hash are value-based.
	    """

	    # Identifier of the fossil.
	    fossil_id: str
	    # One of the `Selection` literals.
	    selection: Selection
	    # URL of the fossil image.
	    image_url: str
	    # URL of the web page associated with the fossil.
	    page_url: str


	# Site landing page; used as an entry's page_url when the CSV row provides none.
	FOSSIL_LEAF_LENS_URL = "https://serre-lab.github.io/FossilLeafLens/"
	# Prefix of legacy per-fossil page URLs; rows whose page_url starts with it are rewritten.
	_OLD_PAGE_PREFIX = "https://serre-lab.github.io/prj_fossil_unknown/pages/unknown/"
	# Prefix substituted for _OLD_PAGE_PREFIX when a legacy page URL is rewritten.
	_NEW_PAGE_PREFIX = "https://serre-lab.github.io/FossilLeafLens/pages/"
	# Base of the default image URL, built as "<base><fossil_id>/image.jpg" when a row has no image_url.
	INFERENCE_IMAGE_BASE_URL = "https://storage.googleapis.com/serrelab/fossil_lens/inference_concepts2/"


	def load_fossil_entries_from_csv(csv_path: str, *, shuffle: bool = True) -> list[FossilEntry]:
	    """
	    Load Florissant unidentified fossils from the flattened CSV, excluding Not Applicable entries.

	    Expected columns:
	    - fossil_id
	    - user_selection
	    - is_applicable (0/1)
	    - image_url
	    - page_url

	    Row handling:
	    - Rows with an empty fossil_id, or with is_applicable == "0", are skipped.
	    - A user_selection outside the known labels is stored as "Unlabeled".
	    - A missing or blank image_url falls back to
	      "<INFERENCE_IMAGE_BASE_URL><fossil_id>/image.jpg".
	    - A page_url starting with the legacy prefix is rewritten to the new
	      prefix; a missing or blank page_url falls back to FOSSIL_LEAF_LENS_URL.
	    - When a fossil_id appears more than once, the first row wins unless it
	      is "Unlabeled" and a later row carries a real label.

	    Args:
	        csv_path: Path to the CSV file (UTF-8, with or without a BOM).
	        shuffle: When true, the result is shuffled in place with the module
	            level ``random`` generator; otherwise entries keep the order in
	            which each fossil_id was first seen.

	    Returns:
	        One FossilEntry per distinct fossil_id.

	    Raises:
	        FileNotFoundError: If csv_path does not exist.
	    """
	    if not os.path.exists(csv_path):
	        raise FileNotFoundError(f"Fossil library CSV not found: {csv_path}")

	    known_selections = ("Plausible", "Not Sure", "Impossible", "Unlabeled")
	    entries: dict[str, FossilEntry] = {}

	    # newline="" lets the csv module do its own newline handling (required
	    # for quoted fields containing line breaks). utf-8-sig drops a leading
	    # BOM so the first header is "fossil_id" rather than "\ufefffossil_id",
	    # which would otherwise cause every row to be skipped silently.
	    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
	        for row in csv.DictReader(f):
	            fossil_id = (row.get("fossil_id") or "").strip()
	            if not fossil_id:
	                continue
	            if (row.get("is_applicable") or "").strip() == "0":
	                continue  # Exclude Not Applicable

	            selection_raw = (row.get("user_selection") or "").strip()
	            selection: Selection
	            if selection_raw in known_selections:
	                selection = selection_raw  # type: ignore[assignment]
	            else:
	                selection = "Unlabeled"

	            # Strip before applying the fallback so a whitespace-only cell
	            # gets the default image URL instead of an empty string.
	            image_url = (
	                (row.get("image_url") or "").strip()
	                or f"{INFERENCE_IMAGE_BASE_URL}{fossil_id}/image.jpg"
	            )

	            # Map old prj_fossil_unknown pages to the new FossilLeafLens pages pattern.
	            raw_page_url = (row.get("page_url") or "").strip()
	            if raw_page_url.startswith(_OLD_PAGE_PREFIX):
	                page_url = raw_page_url.replace(_OLD_PAGE_PREFIX, _NEW_PAGE_PREFIX, 1)
	            else:
	                page_url = raw_page_url or FOSSIL_LEAF_LENS_URL

	            # Keep the first row per fossil, but let a labeled row replace
	            # an earlier "Unlabeled" one.
	            existing = entries.get(fossil_id)
	            if existing is None or (existing.selection == "Unlabeled" and selection != "Unlabeled"):
	                entries[fossil_id] = FossilEntry(
	                    fossil_id=fossil_id,
	                    selection=selection,
	                    image_url=image_url,
	                    page_url=page_url,
	                )

	    result = list(entries.values())
	    if shuffle:
	        random.shuffle(result)
	    return result