# fossil_app / plausible_fossils.py
# piperod91's picture
# adding fossils
# 48387d7
import glob
import json
import os
import random
from html import escape
from typing import Dict, List, Optional, Tuple

import pandas as pd
# Path to fossils_paths.csv, resolved relative to this module's directory.
# The CSV itself is read on demand by get_fossil_url_from_csv to map fossil
# names to image URLs and families; nothing is loaded at import time.
FOSSILS_CSV_PATH = os.path.join(os.path.dirname(__file__), "fossils_paths.csv")
def get_fossil_url_from_csv(
    fossil_name: str, csv_path: Optional[str] = None
) -> Tuple[Optional[str], Optional[str]]:
    """
    Search the fossils paths CSV to find the public URL and family for a fossil.

    The ``file_name`` column is searched with case-insensitive, literal
    substring matching. Three patterns are tried in order and the first one
    that matches any row wins: the full name, the name with the "FLFO_" /
    "CU_" prefixes removed, and the part after the last underscore. Only the
    first matching row is used.

    Args:
        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")
        csv_path: CSV file to search. Defaults to FOSSILS_CSV_PATH.

    Returns:
        Tuple of (URL to the fossil image, family name), or (None, None) if
        the name is empty / not a string, the CSV is missing or unreadable,
        no row matches, or the matched path is not under a known image folder.
        The family is "Unknown" when the CSV has no family value for the row.
    """
    # Names come from JSON via dict.get and may be None; an empty name would
    # also produce an empty search pattern, which matches every row.
    if not isinstance(fossil_name, str) or not fossil_name:
        return None, None

    if csv_path is None:
        csv_path = FOSSILS_CSV_PATH

    # CSV filenames may not include the full prefix
    # (e.g., "FLFO_002787A" -> "002787A"), so try progressively shorter forms.
    candidates = (
        fossil_name,  # Full name
        fossil_name.replace("FLFO_", "").replace("CU_", ""),  # Without prefix
        fossil_name.split("_")[-1],  # Last part after underscore
    )
    # Drop duplicates (keeps order) and empty patterns: "" is a substring of
    # every file name and would select an arbitrary first row.
    search_patterns = [p for p in dict.fromkeys(candidates) if p]

    # (marker found in the local path, local prefix to strip, public base URL)
    url_rewrites = (
        (
            'Florissant_Fossil/512/full/jpg/',
            '/gpfs/data/tserre/irodri15/Fossils/new_data/leavesdb-v1_1/images/Fossil/Florissant_Fossil/512/full/jpg/',
            'https://storage.googleapis.com/serrelab/prj_fossils/2024/Florissant_Fossil_v2.0/',
        ),
        (
            'General_Fossil/512/full/jpg/',
            '/gpfs/data/tserre/irodri15/Fossils/new_data/leavesdb-v1_1/images/Fossil/General_Fossil/512/full/jpg/',
            'https://storage.googleapis.com/serrelab/prj_fossils/2024/General_Fossil_v2.0/',
        ),
    )

    try:
        if not os.path.exists(csv_path):
            return None, None
        df = pd.read_csv(csv_path)
        file_names = df['file_name']

        matching_rows = None
        for pattern in search_patterns:
            mask = file_names.str.contains(pattern, case=False, na=False, regex=False)
            if mask.any():
                matching_rows = df[mask]
                break
        if matching_rows is None:
            return None, None

        # Get the first match
        row = matching_rows.iloc[0]
        file_path = row['file_name']
        family = row.get('family', 'Unknown')
        if pd.isna(family):
            # A blank CSV cell is read as NaN; report it like a missing column.
            family = 'Unknown'

        # Convert the local storage path to its public URL
        for marker, local_prefix, public_base in url_rewrites:
            if marker in file_path:
                return file_path.replace(local_prefix, public_base), family
        return None, None
    except Exception as e:
        # Best-effort lookup: any CSV problem (unreadable file, missing
        # column, unexpected dtype) is reported and treated as "not found".
        print(f"Error searching CSV for {fossil_name}: {e}")
        return None, None
def load_plausible_fossils(json_dir: str = None) -> List[Dict]:
    """
    Load the JSON response files from a directory and extract 'Plausible' entries.

    Only files named ``unidentified_fossil_responses*.json`` are read. Files
    are processed in sorted name order so that, when the same fossil name
    appears more than once, the entry that is kept is always the same one.
    Files that cannot be read or parsed, or whose top level is not a list,
    are reported and skipped; list items that are not objects are ignored.

    Args:
        json_dir: Directory containing the JSON response files (defaults to ../Unknown)

    Returns:
        List of dictionaries with "Serial Number", "Fossil Name" and
        "Source File" keys, one per distinct fossil name, first occurrence kept.
    """
    if json_dir is None:
        # Default to ../Unknown relative to the fossil_app directory
        json_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Unknown")

    # glob returns files in arbitrary order; sort for a stable result.
    json_files = sorted(
        glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json"))
    )

    seen = set()
    unique_fossils = []
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {json_file}: {e}")
            continue

        if not isinstance(data, list):
            print(f"Error loading {json_file}: expected a JSON list of entries")
            continue

        source_file = os.path.basename(json_file)
        for entry in data:
            # Skip malformed items instead of abandoning the rest of the file.
            if not isinstance(entry, dict):
                continue
            if entry.get("User Selection") != "Plausible":
                continue
            # Remove duplicates based on Fossil Name (first occurrence wins)
            fossil_name = entry.get("Fossil Name")
            if fossil_name in seen:
                continue
            seen.add(fossil_name)
            unique_fossils.append({
                "Serial Number": entry.get("Serial Number"),
                "Fossil Name": fossil_name,
                "Source File": source_file,
            })
    return unique_fossils
def get_fossil_image_url(fossil_name: str) -> Tuple[str, str]:
    """
    Resolve a fossil's image URL and family, with a placeholder fallback.

    Args:
        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")

    Returns:
        The (URL, family) pair found by the CSV lookup when that lookup yields
        a URL; otherwise a (placeholder URL, "Unknown") pair.
    """
    csv_url, csv_family = get_fossil_url_from_csv(fossil_name)
    if not csv_url:
        # No CSV hit: build a guessed Florissant URL. It is only a placeholder
        # and may not point at a real image.
        placeholder_root = "https://storage.googleapis.com/serrelab/prj_fossils/2024/Florissant_Fossil_v2.0/"
        return placeholder_root + f"{fossil_name}/image.jpg", "Unknown"
    return csv_url, csv_family
def get_random_plausible_fossils(count: int = 10, json_dir: str = None) -> List[Dict]:
    """
    Pick a random subset of the plausible fossils and attach image details.

    Args:
        count: Number of fossils to return
        json_dir: Directory containing JSON files

    Returns:
        List of fossil dictionaries, each extended with "Image URL" and
        "Family". When no more than ``count`` fossils are available, all of
        them are returned without sampling.
    """
    candidates = load_plausible_fossils(json_dir)
    # Sample only when there are more candidates than requested.
    picked = random.sample(candidates, count) if len(candidates) > count else candidates
    for record in picked:
        record["Image URL"], record["Family"] = get_fossil_image_url(record["Fossil Name"])
    return picked
def get_all_plausible_fossils(json_dir: str = None) -> List[Dict]:
    """
    Return every plausible fossil, each annotated with its image details.

    Args:
        json_dir: Directory containing JSON files

    Returns:
        List of all fossil dictionaries, each extended with "Image URL" and
        "Family" entries.
    """
    records = load_plausible_fossils(json_dir)
    for record in records:
        image_url, family_name = get_fossil_image_url(record["Fossil Name"])
        record.update({"Image URL": image_url, "Family": family_name})
    return records
def format_fossil_html(fossil: Dict) -> str:
    """
    Format a fossil entry as an HTML card with a link to its image.

    All values taken from ``fossil`` are HTML-escaped before being inserted,
    because they originate from JSON/CSV files rather than from this code.
    The image link is rendered only for http(s) URLs that are not the
    ".../image.jpg" placeholder; otherwise a "not available" note is shown.

    Args:
        fossil: Dictionary containing fossil information. Reads the keys
            "Fossil Name", "Image URL", "Serial Number" and "Family"; missing
            keys fall back to "Unknown" or an empty string.

    Returns:
        HTML string for the fossil
    """
    fossil_name = escape(str(fossil.get("Fossil Name", "Unknown")))
    serial_num = escape(str(fossil.get("Serial Number", "")))
    family = escape(str(fossil.get("Family", "Unknown")))
    image_url = fossil.get("Image URL", "")

    # Check if URL is valid: a real web URL and not the placeholder that is
    # generated when the fossil is missing from the CSV.
    has_valid_url = (
        isinstance(image_url, str)
        and image_url.lower().startswith(("http://", "https://"))
        and "image.jpg" not in image_url
    )

    if has_valid_url:
        # quote=True also escapes the single quote that delimits the attribute.
        safe_url = escape(image_url, quote=True)
        link_html = f"""
<p style='margin: 10px 0 0 0;'>
<a href='{safe_url}' target='_blank' style='color: #0066cc; text-decoration: none; font-weight: bold; padding: 8px 15px; background-color: #e3f2fd; border-radius: 4px; display: inline-block;'>
🔗 View Image →
</a>
</p>
"""
    else:
        link_html = """
<p style='margin: 10px 0 0 0; color: #666; font-style: italic;'>
Image URL not available in database
</p>
"""

    return f"""
<div style='border: 1px solid #ddd; padding: 15px; margin: 10px; border-radius: 8px; background-color: #fafafa; box-shadow: 0 2px 4px rgba(0,0,0,0.1);'>
<h3 style='margin-top: 0; margin-bottom: 10px; color: #333;'>{fossil_name}</h3>
<p style='margin: 5px 0;'><strong>Serial Number:</strong> {serial_num}</p>
<p style='margin: 5px 0;'><strong>Family:</strong> {family}</p>
{link_html}
</div>
"""