# fossil_app/create_fossil_csv.py
# Last change (commit 8dc677a, piperod91): "Update closest images display:
# convert gallery to table format with full specimen names"
#!/usr/bin/env python3
"""
Create CSV file from JSON response files and HTML image links.
This script:
1. Parses all JSON files from ../Unknown/ directory
2. Filters entries where User Selection is not null and not "Not Applicable"
3. Extracts image URLs from index.html by matching fossil names
4. Creates a CSV file with Serial Number, Fossil Name, User Selection, and Image URL
"""
import json
import os
import glob
import re
import csv
from typing import List, Dict, Optional
from collections import defaultdict
# Paths
# All paths are derived from this script's own location so the tool can be
# run from any working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# ../Unknown relative to this script: holds the JSON response files and index.html.
UNKNOWN_DIR = os.path.join(os.path.dirname(SCRIPT_DIR), "Unknown")
HTML_FILE = os.path.join(UNKNOWN_DIR, "index.html")
# The output CSV is written next to this script.
OUTPUT_CSV = os.path.join(SCRIPT_DIR, "fossil_responses_with_images.csv")
def load_json_files(json_dir: str) -> List[Dict]:
    """
    Load all JSON response files and extract valid entries.

    Reads every ``unidentified_fossil_responses*.json`` file in *json_dir*,
    keeps only entries whose "User Selection" is a real choice (not null,
    "Not Applicable", or "Impossible"), then de-duplicates entries on the
    (Serial Number, Fossil Name) pair.

    Args:
        json_dir: Directory containing JSON files

    Returns:
        List of dictionaries with Serial Number, Fossil Name, User Selection
    """
    # Sentinel selections that mean "no usable identification".
    excluded_selections = (None, "Not Applicable", "Impossible")
    all_entries = []
    json_files = glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json"))
    print(f"Found {len(json_files)} JSON files")
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Filter entries: exclude null, "Not Applicable", and "Impossible"
            valid_entries = [
                entry for entry in data
                if entry.get("User Selection") not in excluded_selections
            ]
            all_entries.extend(valid_entries)
            print(f" {os.path.basename(json_file)}: {len(valid_entries)} valid entries")
        except Exception as e:
            print(f"Error loading {json_file}: {e}")
            continue
    # De-duplicate on (Serial Number, Fossil Name) using a dict, which is O(n)
    # overall; the previous `next(...)` scan over the accumulated list made
    # every duplicate cost O(n), i.e. O(n^2) worst case. Dict insertion order
    # is preserved, so the output order matches the original behavior.
    unique_by_key: Dict = {}
    for entry in all_entries:
        key = (entry.get("Serial Number"), entry.get("Fossil Name"))
        existing = unique_by_key.get(key)
        if existing is None:
            unique_by_key[key] = entry
        elif entry.get("User Selection") and not existing.get("User Selection"):
            # On duplicates, prefer the entry that has a truthy User Selection.
            unique_by_key[key] = entry
    unique_entries = list(unique_by_key.values())
    print(f"\nTotal unique valid entries: {len(unique_entries)}")
    return unique_entries
def extract_image_urls_from_html(html_file: str) -> Dict[str, str]:
    """
    Extract image URLs from HTML file by parsing img tags with fossil-image class.

    Scans the file line by line, remembering the most recent fossil name seen
    in a ``<td class="fossil-name">`` cell and pairing it with the next
    ``<td class="fossil-image">`` image URL.

    Args:
        html_file: Path to index.html file

    Returns:
        Dictionary mapping fossil_name to image_url (empty if the file is
        missing or unparsable)
    """
    image_urls: Dict[str, str] = {}
    if not os.path.exists(html_file):
        print(f"Warning: HTML file not found: {html_file}")
        return image_urls
    print(f"\nParsing HTML file: {html_file}")
    # Compile the patterns once, outside the per-line loop: re-resolving them
    # through re.search/re.sub for every line of a large file is wasted work.
    name_re = re.compile(r'<td class="fossil-name">.*?>(.*?)</a>')
    image_re = re.compile(r'<td class="fossil-image"><img src="(https://storage\.googleapis\.com/serrelab/fossil_lens/inference_concepts2/[^"]+)"')
    tag_re = re.compile(r'<[^>]+>')  # strips any nested markup from the name
    try:
        current_fossil_name = None
        with open(html_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Look for fossil name in fossil-name td
                name_match = name_re.search(line)
                if name_match:
                    fossil_name = tag_re.sub('', name_match.group(1)).strip()
                    if fossil_name:
                        current_fossil_name = fossil_name
                # Look for image URL in fossil-image td
                image_match = image_re.search(line)
                if image_match and current_fossil_name:
                    image_urls[current_fossil_name] = image_match.group(1)
                    current_fossil_name = None  # Reset after finding image
        print(f"Extracted {len(image_urls)} image URLs from HTML")
    except Exception as e:
        print(f"Error parsing HTML file: {e}")
        import traceback
        traceback.print_exc()
    return image_urls
def create_csv(entries: List[Dict], image_urls: Dict[str, str], output_file: str):
    """
    Create CSV file with fossil data and image URLs.

    Rows are written in ascending Serial Number order; URLs are wrapped as
    Excel =HYPERLINK() formulas so they are clickable in spreadsheet apps.

    Args:
        entries: List of fossil entries from JSON files
        image_urls: Dictionary mapping fossil names to image URLs
        output_file: Path to output CSV file
    """
    print(f"\nCreating CSV file: {output_file}")
    # sorted() instead of list.sort(): don't mutate the caller's list.
    rows = sorted(entries, key=lambda x: x.get("Serial Number", 0))
    matched_count = 0
    unmatched_count = 0
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Serial Number', 'Fossil Name', 'User Selection', 'Image URL']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in rows:
            fossil_name = entry.get("Fossil Name", "")
            image_url = image_urls.get(fossil_name, "")
            # Single branch both counts the match and formats the cell
            # (previously duplicated as two back-to-back `if image_url:` tests).
            if image_url:
                matched_count += 1
                # Format as Excel hyperlink: =HYPERLINK("url", "View Image")
                formatted_url = f'=HYPERLINK("{image_url}", "View Image")'
            else:
                unmatched_count += 1
                formatted_url = ""
            writer.writerow({
                'Serial Number': entry.get("Serial Number", ""),
                'Fossil Name': fossil_name,
                'User Selection': entry.get("User Selection", ""),
                'Image URL': formatted_url
            })
    print(f"\nCSV file created successfully!")
    print(f" Total entries: {len(entries)}")
    print(f" Matched with image URLs: {matched_count}")
    print(f" Unmatched (no image URL): {unmatched_count}")
def main():
    """Entry point: load responses, match image URLs, and write the CSV."""
    banner = "=" * 60
    print(banner)
    print("Creating CSV from JSON responses and HTML image links")
    print(banner)

    # Step 1: load and filter the JSON response files.
    entries = load_json_files(UNKNOWN_DIR)
    if not entries:
        print("No valid entries found. Exiting.")
        return

    # Step 2: map fossil names to image URLs from the HTML index.
    image_urls = extract_image_urls_from_html(HTML_FILE)

    # Step 3: write everything out as a CSV.
    create_csv(entries, image_urls, OUTPUT_CSV)

    print("\n" + banner)
    print(f"Done! Output file: {OUTPUT_CSV}")
    print(banner)


if __name__ == "__main__":
    main()