import glob
import json
import os
import random
from typing import Dict, List, Optional, Tuple

import pandas as pd

# Load the fossils paths CSV to map fossil names to URLs
FOSSILS_CSV_PATH = os.path.join(os.path.dirname(__file__), "fossils_paths.csv")

# Public bucket folders that mirror the two local image collections.
_PUBLIC_BASE_FLORISSANT = "https://storage.googleapis.com/serrelab/prj_fossils/2024/Florissant_Fossil_v2.0/"
_PUBLIC_BASE_GENERAL = "https://storage.googleapis.com/serrelab/prj_fossils/2024/General_Fossil_v2.0/"

# (marker found inside a local image path, public folder that replaces
# everything up to and including that marker). Checked in this order.
_PUBLIC_FOLDERS = (
    ("Florissant_Fossil/512/full/jpg/", _PUBLIC_BASE_FLORISSANT),
    ("General_Fossil/512/full/jpg/", _PUBLIC_BASE_GENERAL),
)

# Most recently parsed CSV, keyed by (path, mtime, size) so the file is parsed
# once per change instead of once per fossil lookup.
_CSV_CACHE: Dict[str, object] = {"key": None, "frame": None}


def _load_fossil_table() -> Optional[pd.DataFrame]:
    """Return the parsed fossils CSV, or None when the file cannot be stat'ed."""
    try:
        stat = os.stat(FOSSILS_CSV_PATH)
    except OSError:
        return None

    key = (FOSSILS_CSV_PATH, stat.st_mtime_ns, stat.st_size)
    if _CSV_CACHE["key"] != key:
        # Store the frame before the key so a failed read is never cached.
        _CSV_CACHE["frame"] = pd.read_csv(FOSSILS_CSV_PATH)
        _CSV_CACHE["key"] = key
    return _CSV_CACHE["frame"]


def _search_patterns(fossil_name: str) -> List[str]:
    """Return the distinct, non-empty substrings to look for, most specific first."""
    candidates = [
        fossil_name,  # Full name
        fossil_name.replace("FLFO_", "").replace("CU_", ""),  # Without prefix
        fossil_name.split("_")[-1],  # Last part after underscore
    ]
    # An empty pattern would match every row of the CSV, so it must be dropped.
    return list(dict.fromkeys(candidate for candidate in candidates if candidate))


def _to_public_url(file_path: str) -> Optional[str]:
    """Map a local image path to its public URL, or None for unknown collections."""
    for marker, public_base in _PUBLIC_FOLDERS:
        _, found, remainder = file_path.partition(marker)
        if found:
            # Keep only what follows the marker so the result is a URL whatever
            # the local root directory in front of the marker is.
            return public_base + remainder
    return None


def _attach_image_info(fossils: List[Dict]) -> List[Dict]:
    """Add the "Image URL" and "Family" keys to every fossil dict, in place."""
    for fossil in fossils:
        url, family = get_fossil_image_url(fossil["Fossil Name"])
        fossil["Image URL"] = url
        fossil["Family"] = family
    return fossils


def get_fossil_url_from_csv(fossil_name: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Search the fossils_paths.csv to find the URL and family for a fossil.

    The name is matched case-insensitively as a substring of the 'file_name'
    column, first as given, then without the "FLFO_"/"CU_" prefix, then using
    only the part after the last underscore. The first row of the first
    pattern that matches is used.

    Args:
        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")

    Returns:
        Tuple of (URL to the fossil image, family name) or (None, None) if the
        name is empty or not a string, the CSV is missing or unreadable, no row
        matches, or the matched path belongs to neither known collection.
        The family is "Unknown" when the CSV has no value for it.
    """
    if not isinstance(fossil_name, str) or not fossil_name.strip():
        return None, None

    try:
        table = _load_fossil_table()
        if table is None:
            return None, None

        file_names = table["file_name"]
        row = None
        for pattern in _search_patterns(fossil_name):
            mask = file_names.str.contains(pattern, case=False, na=False, regex=False)
            if mask.any():
                row = table[mask].iloc[0]
                break
        if row is None:
            return None, None

        public_url = _to_public_url(row["file_name"])
        if public_url is None:
            return None, None

        family = row.get("family", "Unknown")
        if pd.isna(family):
            # Empty CSV cells are read as NaN; report them like a missing column.
            family = "Unknown"
        return public_url, family
    except Exception as e:
        # Deliberately best-effort: a malformed CSV must not break the callers,
        # which fall back to a placeholder URL.
        print(f"Error searching CSV for {fossil_name}: {e}")
        return None, None


def load_plausible_fossils(json_dir: Optional[str] = None) -> List[Dict]:
    """
    Load all JSON files from the directory and extract entries marked as 'Plausible'.

    Files named "unidentified_fossil_responses*.json" are read in sorted order.
    Files that cannot be read or parsed, or that do not hold a list, are
    reported on stdout and skipped; entries that are not dicts are ignored.
    Entries are de-duplicated on "Fossil Name", keeping the first occurrence.

    Args:
        json_dir: Directory containing the JSON response files (defaults to ../Unknown)

    Returns:
        List of dictionaries with the keys "Serial Number", "Fossil Name" and
        "Source File" (base name of the file the entry came from)
    """
    if json_dir is None:
        # Default to ../Unknown relative to the directory containing this file
        json_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "Unknown")

    # Sorted so that "first occurrence wins" does not depend on filesystem order.
    json_files = sorted(glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json")))

    seen = set()
    unique_fossils = []
    for json_file in json_files:
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Error loading {json_file}: {e}")
            continue

        if not isinstance(data, list):
            print(f"Error loading {json_file}: expected a list of entries, got {type(data).__name__}")
            continue

        source_file = os.path.basename(json_file)
        for entry in data:
            if not isinstance(entry, dict) or entry.get("User Selection") != "Plausible":
                continue
            fossil_name = entry.get("Fossil Name")
            if fossil_name in seen:
                continue
            seen.add(fossil_name)
            unique_fossils.append({
                "Serial Number": entry.get("Serial Number"),
                "Fossil Name": fossil_name,
                "Source File": source_file,
            })

    return unique_fossils


def get_fossil_image_url(fossil_name: str) -> Tuple[str, str]:
    """
    Get the image URL and family for a fossil by searching the CSV file.

    Args:
        fossil_name: Name of the fossil (e.g., "FLFO_002787A", "CU_0387cu")

    Returns:
        Tuple of (URL to the fossil image, family name) or (placeholder URL, "Unknown")
    """
    url, family = get_fossil_url_from_csv(fossil_name)
    if url:
        return url, family

    # Fallback: construct a basic URL (may not work for all fossils).
    # format_fossil_html recognises this placeholder by its "image.jpg" suffix.
    return f"{_PUBLIC_BASE_FLORISSANT}{fossil_name}/image.jpg", "Unknown"


def get_random_plausible_fossils(count: int = 10, json_dir: Optional[str] = None) -> List[Dict]:
    """
    Get a random selection of plausible fossils.

    Args:
        count: Number of fossils to return; every fossil is returned when fewer
            than `count` are available, and none when `count` is not positive
        json_dir: Directory containing JSON files

    Returns:
        List of fossil dictionaries with image URLs
    """
    all_plausible = load_plausible_fossils(json_dir)

    # random.sample rejects negative sizes, so treat them as "no fossils".
    wanted = max(count, 0)
    if len(all_plausible) <= wanted:
        selected = all_plausible
    else:
        selected = random.sample(all_plausible, wanted)

    return _attach_image_info(selected)


def get_all_plausible_fossils(json_dir: Optional[str] = None) -> List[Dict]:
    """
    Get all plausible fossils with image URLs.

    Args:
        json_dir: Directory containing JSON files

    Returns:
        List of all fossil dictionaries with image URLs
    """
    return _attach_image_info(load_plausible_fossils(json_dir))
Args: fossil: Dictionary containing fossil information Returns: HTML string for the fossil """ fossil_name = fossil.get("Fossil Name", "Unknown") image_url = fossil.get("Image URL", "") serial_num = fossil.get("Serial Number", "") family = fossil.get("Family", "Unknown") # Check if URL is valid (not a placeholder) has_valid_url = image_url and "image.jpg" not in image_url and image_url != "" link_html = "" if has_valid_url: link_html = f"""
""" else: link_html = """Image URL not available in database
""" html = f"""Serial Number: {serial_num}
Family: {family}
{link_html}