Spaces:

Serrelab
/

fossil_app

Sleeping

App Files Files Community

fossil_app / plausible_fossils.py

piperod91

adding fossils

48387d7 19 days ago

raw

history blame contribute delete

8.16 kB

	import glob
	import json
	import os
	import random
	from typing import Dict, List, Optional, Tuple

	import pandas as pd

	# Location of the CSV that maps fossil image file paths to families; the file
	# is looked up next to this module.
	FOSSILS_CSV_PATH = os.path.join(os.path.dirname(__file__), "fossils_paths.csv")


	def get_fossil_url_from_csv(fossil_name: str) -> Tuple[Optional[str], Optional[str]]:
	    """
	    Search fossils_paths.csv for a fossil and return its public image URL and family.

	    The CSV's ``file_name`` column is searched case-insensitively for the
	    fossil name as a plain substring. Because CSV filenames may not include
	    the full prefix (e.g., "FLFO_002787A" -> "002787A"), up to three patterns
	    are tried in order and the first row of the first pattern that matches
	    anything is used.

	    Args:
	        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")

	    Returns:
	        Tuple of (URL to the fossil image, family name). The family is
	        "Unknown" when the CSV has no value for it. Returns (None, None) when
	        the name is blank, the CSV is missing or unreadable, nothing matches,
	        or the matched path belongs to neither known fossil collection.
	    """
	    # A blank name would yield an empty search pattern, and an empty substring
	    # matches every row, which would return an unrelated fossil.
	    if not isinstance(fossil_name, str) or not fossil_name.strip():
	        return None, None

	    # Collection sub-path inside the stored file path -> public bucket folder.
	    # Everything after the marker is appended to the public folder, so the
	    # result does not depend on the absolute root recorded in the CSV.
	    public_folders = (
	        (
	            'Florissant_Fossil/512/full/jpg/',
	            'https://storage.googleapis.com/serrelab/prj_fossils/2024/Florissant_Fossil_v2.0/',
	        ),
	        (
	            'General_Fossil/512/full/jpg/',
	            'https://storage.googleapis.com/serrelab/prj_fossils/2024/General_Fossil_v2.0/',
	        ),
	    )

	    try:
	        if not os.path.exists(FOSSILS_CSV_PATH):
	            return None, None

	        df = pd.read_csv(FOSSILS_CSV_PATH)

	        candidates = [
	            fossil_name,  # Full name
	            fossil_name.replace("FLFO_", "").replace("CU_", ""),  # Without prefix
	            fossil_name.split("_")[-1],  # Last part after underscore
	        ]
	        # Drop duplicates (keeping order) and empty strings: e.g. "FLFO_"
	        # reduces to "" once its prefix is removed.
	        search_patterns = [p for p in dict.fromkeys(candidates) if p]

	        matching_rows = None
	        for pattern in search_patterns:
	            mask = df['file_name'].str.contains(pattern, case=False, na=False, regex=False)
	            if mask.any():
	                matching_rows = df[mask]
	                break

	        if matching_rows is None:
	            return None, None

	        # Only the first match is considered.
	        row = matching_rows.iloc[0]
	        file_path = row['file_name']
	        family = row.get('family', 'Unknown')
	        if pd.isna(family):
	            # An empty CSV cell is read as NaN; report it like a missing column.
	            family = 'Unknown'

	        for marker, public_folder in public_folders:
	            if marker in file_path:
	                relative_path = file_path.split(marker, 1)[1]
	                return public_folder + relative_path, family

	        return None, None
	    except Exception as e:
	        # Best-effort lookup: any CSV/parsing problem is reported and treated
	        # as "not found" so callers can fall back to a placeholder.
	        print(f"Error searching CSV for {fossil_name}: {e}")
	        return None, None

	def load_plausible_fossils(json_dir: Optional[str] = None) -> List[Dict]:
	    """
	    Load the JSON response files of a directory and extract entries marked 'Plausible'.

	    Files matching ``unidentified_fossil_responses*.json`` are read in sorted
	    path order so the result is deterministic. Each file is expected to hold a
	    list of entry objects; entries whose "User Selection" equals "Plausible"
	    are kept. When the same "Fossil Name" appears more than once, only its
	    first occurrence is returned.

	    Args:
	        json_dir: Directory containing the JSON response files (defaults to
	            the "Unknown" directory next to this module's parent directory)

	    Returns:
	        List of dictionaries with the keys "Serial Number", "Fossil Name" and
	        "Source File" (base name of the file the entry came from). Files that
	        cannot be read or parsed are reported and skipped.
	    """
	    if json_dir is None:
	        # Default to ../Unknown relative to the fossil_app directory
	        json_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Unknown")

	    # glob returns matches in arbitrary order; sorting fixes which duplicate
	    # wins and the order of the returned list.
	    json_files = sorted(glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json")))

	    seen = set()
	    unique_fossils = []

	    for json_file in json_files:
	        try:
	            with open(json_file, 'r', encoding='utf-8') as f:
	                data = json.load(f)
	        except (OSError, ValueError) as e:
	            # ValueError covers both malformed JSON and undecodable bytes.
	            print(f"Error loading {json_file}: {e}")
	            continue

	        if not isinstance(data, list):
	            print(f"Error loading {json_file}: expected a list of entries")
	            continue

	        source_file = os.path.basename(json_file)
	        for entry in data:
	            # Skip stray non-object items instead of abandoning the whole file.
	            if not isinstance(entry, dict):
	                continue
	            if entry.get("User Selection") != "Plausible":
	                continue

	            fossil_name = entry.get("Fossil Name")
	            if fossil_name in seen:
	                continue
	            seen.add(fossil_name)
	            unique_fossils.append({
	                "Serial Number": entry.get("Serial Number"),
	                "Fossil Name": fossil_name,
	                "Source File": source_file,
	            })

	    return unique_fossils

	def get_fossil_image_url(fossil_name: str) -> Tuple[str, str]:
	    """
	    Resolve a fossil's image URL and family, falling back to a placeholder URL.

	    The lookup is delegated to get_fossil_url_from_csv. When that lookup
	    yields no URL, a guessed address under the Florissant bucket ending in
	    "/image.jpg" is returned together with the family "Unknown".

	    Args:
	        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")

	    Returns:
	        Tuple of (URL to the fossil image, family name) or (placeholder URL, "Unknown")
	    """
	    csv_url, csv_family = get_fossil_url_from_csv(fossil_name)

	    if not csv_url:
	        # No CSV hit: build a basic URL that may not exist for every fossil.
	        placeholder_root = "https://storage.googleapis.com/serrelab/prj_fossils/2024/Florissant_Fossil_v2.0/"
	        return f"{placeholder_root}{fossil_name}/image.jpg", "Unknown"

	    return csv_url, csv_family

	def get_random_plausible_fossils(count: int = 10, json_dir: str = None) -> List[Dict]:
	    """
	    Pick plausible fossils at random and attach image URLs and families.

	    Every plausible fossil is returned when there are no more than `count`
	    of them; otherwise `count` of them are sampled without replacement.

	    Args:
	        count: Number of fossils to return
	        json_dir: Directory containing JSON files

	    Returns:
	        List of fossil dictionaries, each extended with "Image URL" and "Family"
	    """
	    candidates = load_plausible_fossils(json_dir)

	    # Sample only when there are more candidates than requested.
	    picked = random.sample(candidates, count) if len(candidates) > count else candidates

	    for entry in picked:
	        entry["Image URL"], entry["Family"] = get_fossil_image_url(entry["Fossil Name"])

	    return picked

	def get_all_plausible_fossils(json_dir: str = None) -> List[Dict]:
	    """
	    Return every plausible fossil, enriched with its image URL and family.

	    Args:
	        json_dir: Directory containing JSON files

	    Returns:
	        List of all fossil dictionaries, each extended with "Image URL" and "Family"
	    """
	    fossils = load_plausible_fossils(json_dir)

	    for record in fossils:
	        image_url, family_name = get_fossil_image_url(record["Fossil Name"])
	        record.update({"Image URL": image_url, "Family": family_name})

	    return fossils

	def _html_text(value, default: str) -> str:
	    """Return `value` as HTML-escaped text, using `default` for None/NaN values."""
	    from html import escape

	    # NaN is the only float that differs from itself; it appears when a value
	    # was read from an empty CSV cell.
	    is_missing = value is None or (isinstance(value, float) and value != value)
	    return escape(str(default if is_missing else value), quote=True)


	def format_fossil_html(fossil: Dict) -> str:
	    """
	    Format a fossil entry as an HTML card with a link to its image.

	    All values taken from `fossil` are HTML-escaped before being inserted,
	    because they come from data files and may contain markup characters or
	    quotes that would otherwise break out of the tag or attribute.

	    Args:
	        fossil: Dictionary containing fossil information. The keys read are
	            "Fossil Name", "Image URL", "Serial Number" and "Family"; all are
	            optional.

	    Returns:
	        HTML string for the fossil. The card shows a "View Image" link when
	        the URL is a non-empty string that is not a placeholder, and a
	        "not available" note otherwise.
	    """
	    fossil_name = _html_text(fossil.get("Fossil Name", "Unknown"), "Unknown")
	    serial_num = _html_text(fossil.get("Serial Number", ""), "")
	    family = _html_text(fossil.get("Family", "Unknown"), "Unknown")
	    image_url = fossil.get("Image URL", "")

	    # Placeholder URLs are recognised by the "image.jpg" they contain.
	    has_valid_url = (
	        isinstance(image_url, str)
	        and bool(image_url)
	        and "image.jpg" not in image_url
	    )

	    if has_valid_url:
	        # The href is single-quoted, so quotes inside the URL must be escaped too.
	        safe_url = _html_text(image_url, "")
	        link_html = f"""
	    <p style='margin: 10px 0 0 0;'>
	    <a href='{safe_url}' target='_blank' style='color: #0066cc; text-decoration: none; font-weight: bold; padding: 8px 15px; background-color: #e3f2fd; border-radius: 4px; display: inline-block;'>
	    🔗 View Image →
	    </a>
	    </p>
	    """
	    else:
	        link_html = """
	    <p style='margin: 10px 0 0 0; color: #666; font-style: italic;'>
	    Image URL not available in database
	    </p>
	    """

	    card_html = f"""
	    <div style='border: 1px solid #ddd; padding: 15px; margin: 10px; border-radius: 8px; background-color: #fafafa; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
	    <h3 style='margin-top: 0; margin-bottom: 10px; color: #333;'>{fossil_name}</h3>
	    <p style='margin: 5px 0;'><strong>Serial Number:</strong> {serial_num}</p>
	    <p style='margin: 5px 0;'><strong>Family:</strong> {family}</p>
	    {link_html}
	    </div>
	    """
	    return card_html