Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Create CSV file from JSON response files and HTML image links. | |
| This script: | |
| 1. Parses all JSON files from ../Unknown/ directory | |
| 2. Filters entries where User Selection is not null and not "Not Applicable" | |
| 3. Extracts image URLs from index.html by matching fossil names | |
| 4. Creates a CSV file with Serial Number, Fossil Name, User Selection, and Image URL | |
| """ | |
| import json | |
| import os | |
| import glob | |
| import re | |
| import csv | |
| from typing import List, Dict, Optional | |
| from collections import defaultdict | |
# Paths
# Directory containing this script; all other paths are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Sibling "Unknown" directory — holds the JSON response files and index.html.
UNKNOWN_DIR = os.path.join(os.path.dirname(SCRIPT_DIR), "Unknown")
HTML_FILE = os.path.join(UNKNOWN_DIR, "index.html")
# Output CSV is written next to this script.
OUTPUT_CSV = os.path.join(SCRIPT_DIR, "fossil_responses_with_images.csv")
def load_json_files(json_dir: str) -> List[Dict]:
    """
    Load all JSON response files and extract valid entries.

    Scans ``json_dir`` for files matching ``unidentified_fossil_responses*.json``,
    keeps entries whose "User Selection" is set and is neither "Not Applicable"
    nor "Impossible", then de-duplicates on (Serial Number, Fossil Name).

    Args:
        json_dir: Directory containing JSON files

    Returns:
        List of dictionaries with Serial Number, Fossil Name, User Selection
    """
    # Selections that disqualify an entry; a single membership test replaces
    # the original three chained comparisons.
    rejected = (None, "Not Applicable", "Impossible")

    all_entries: List[Dict] = []
    json_files = glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json"))
    print(f"Found {len(json_files)} JSON files")

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Filter entries: exclude null, "Not Applicable", and "Impossible"
            valid_entries = [
                entry for entry in data
                if entry.get("User Selection") not in rejected
            ]
            all_entries.extend(valid_entries)
            print(f" {os.path.basename(json_file)}: {len(valid_entries)} valid entries")
        except Exception as e:
            print(f"Error loading {json_file}: {e}")
            continue

    # De-duplicate on (Serial Number, Fossil Name).  A dict keyed by that pair
    # replaces the original O(n^2) linear rescan of the unique list; insertion
    # order is preserved, so output ordering is unchanged.  First occurrence
    # wins unless it lacks a User Selection and a later duplicate has one
    # (kept for parity with the original logic, although the filter above
    # already guarantees a non-null selection).
    unique: Dict[tuple, Dict] = {}
    for entry in all_entries:
        key = (entry.get("Serial Number"), entry.get("Fossil Name"))
        existing = unique.get(key)
        if existing is None:
            unique[key] = entry
        elif entry.get("User Selection") and not existing.get("User Selection"):
            unique[key] = entry

    unique_entries = list(unique.values())
    print(f"\nTotal unique valid entries: {len(unique_entries)}")
    return unique_entries
def extract_image_urls_from_html(html_file: str) -> Dict[str, str]:
    """
    Extract image URLs from HTML file by parsing img tags with fossil-image class.

    Streams the file line by line (better performance on large files) and pairs
    each fossil-name cell with the next fossil-image cell that follows it.

    Args:
        html_file: Path to index.html file

    Returns:
        Dictionary mapping fossil_name to image_url
    """
    image_urls: Dict[str, str] = {}
    if not os.path.exists(html_file):
        print(f"Warning: HTML file not found: {html_file}")
        return image_urls

    print(f"\nParsing HTML file: {html_file}")

    # Compile the patterns once, outside the per-line hot loop, instead of
    # handing pattern strings to re.search/re.sub on every line.
    name_re = re.compile(r'<td class="fossil-name">.*?>(.*?)</a>')
    image_re = re.compile(
        r'<td class="fossil-image"><img src="(https://storage\.googleapis\.com/serrelab/fossil_lens/inference_concepts2/[^"]+)"'
    )
    tag_re = re.compile(r'<[^>]+>')  # strips any markup nested inside the name

    try:
        current_fossil_name: Optional[str] = None
        with open(html_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Look for fossil name in fossil-name td
                name_match = name_re.search(line)
                if name_match:
                    fossil_name = tag_re.sub('', name_match.group(1)).strip()
                    if fossil_name:
                        current_fossil_name = fossil_name
                # Look for image URL in fossil-image td
                image_match = image_re.search(line)
                if image_match and current_fossil_name:
                    image_urls[current_fossil_name] = image_match.group(1)
                    current_fossil_name = None  # Reset after finding image
        print(f"Extracted {len(image_urls)} image URLs from HTML")
    except Exception as e:
        print(f"Error parsing HTML file: {e}")
        import traceback
        traceback.print_exc()
    return image_urls
def create_csv(entries: List[Dict], image_urls: Dict[str, str], output_file: str):
    """
    Create CSV file with fossil data and image URLs.

    Note: sorts ``entries`` in place by Serial Number before writing.

    Args:
        entries: List of fossil entries from JSON files
        image_urls: Dictionary mapping fossil names to image URLs
        output_file: Path to output CSV file
    """
    print(f"\nCreating CSV file: {output_file}")

    # Rows are emitted in Serial Number order (in-place sort).
    entries.sort(key=lambda x: x.get("Serial Number", 0))

    matched_count = 0
    unmatched_count = 0

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['Serial Number', 'Fossil Name', 'User Selection', 'Image URL'],
        )
        writer.writeheader()

        for entry in entries:
            name = entry.get("Fossil Name", "")
            url = image_urls.get(name, "")
            if url:
                matched_count += 1
            else:
                unmatched_count += 1
            # Excel-style clickable cell: =HYPERLINK("url", "View Image")
            link = f'=HYPERLINK("{url}", "View Image")' if url else ""
            writer.writerow({
                'Serial Number': entry.get("Serial Number", ""),
                'Fossil Name': name,
                'User Selection': entry.get("User Selection", ""),
                'Image URL': link,
            })

    print(f"\nCSV file created successfully!")
    print(f" Total entries: {len(entries)}")
    print(f" Matched with image URLs: {matched_count}")
    print(f" Unmatched (no image URL): {unmatched_count}")
def main():
    """Main function to orchestrate the CSV creation process."""
    banner = "=" * 60
    print(banner)
    print("Creating CSV from JSON responses and HTML image links")
    print(banner)

    # Step 1: Load and filter JSON files
    entries = load_json_files(UNKNOWN_DIR)
    if not entries:
        print("No valid entries found. Exiting.")
        return

    # Step 2: Extract image URLs from HTML
    image_urls = extract_image_urls_from_html(HTML_FILE)

    # Step 3: Create CSV file
    create_csv(entries, image_urls, OUTPUT_CSV)

    print("\n" + banner)
    print(f"Done! Output file: {OUTPUT_CSV}")
    print(banner)


if __name__ == "__main__":
    main()