#!/usr/bin/env python3 """ Create CSV file from JSON response files and HTML image links. This script: 1. Parses all JSON files from ../Unknown/ directory 2. Filters entries where User Selection is not null and not "Not Applicable" 3. Extracts image URLs from index.html by matching fossil names 4. Creates a CSV file with Serial Number, Fossil Name, User Selection, and Image URL """ import json import os import glob import re import csv from typing import List, Dict, Optional from collections import defaultdict # Paths SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) UNKNOWN_DIR = os.path.join(os.path.dirname(SCRIPT_DIR), "Unknown") HTML_FILE = os.path.join(UNKNOWN_DIR, "index.html") OUTPUT_CSV = os.path.join(SCRIPT_DIR, "fossil_responses_with_images.csv") def load_json_files(json_dir: str) -> List[Dict]: """ Load all JSON response files and extract valid entries. Args: json_dir: Directory containing JSON files Returns: List of dictionaries with Serial Number, Fossil Name, User Selection """ all_entries = [] json_files = glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json")) print(f"Found {len(json_files)} JSON files") for json_file in json_files: try: with open(json_file, 'r', encoding='utf-8') as f: data = json.load(f) # Filter entries: exclude null, "Not Applicable", and "Impossible" valid_entries = [ entry for entry in data if entry.get("User Selection") is not None and entry.get("User Selection") != "Not Applicable" and entry.get("User Selection") != "Impossible" ] all_entries.extend(valid_entries) print(f" {os.path.basename(json_file)}: {len(valid_entries)} valid entries") except Exception as e: print(f"Error loading {json_file}: {e}") continue # Remove duplicates based on Serial Number and Fossil Name combination seen = set() unique_entries = [] for entry in all_entries: key = (entry.get("Serial Number"), entry.get("Fossil Name")) if key not in seen: seen.add(key) unique_entries.append(entry) else: # If duplicate, keep the one with a User Selection (prefer non-null) existing_idx = next(i for i, e in enumerate(unique_entries) if (e.get("Serial Number"), e.get("Fossil Name")) == key) if entry.get("User Selection") and not unique_entries[existing_idx].get("User Selection"): unique_entries[existing_idx] = entry print(f"\nTotal unique valid entries: {len(unique_entries)}") return unique_entries def extract_image_urls_from_html(html_file: str) -> Dict[str, str]: """ Extract image URLs from HTML file by parsing img tags with fossil-image class. Uses line-by-line parsing for better performance on large files. Args: html_file: Path to index.html file Returns: Dictionary mapping fossil_name to image_url """ image_urls = {} if not os.path.exists(html_file): print(f"Warning: HTML file not found: {html_file}") return image_urls print(f"\nParsing HTML file: {html_file}") try: # Use line-by-line parsing for better performance current_fossil_name = None with open(html_file, 'r', encoding='utf-8') as f: for line in f: # Look for fossil name in fossil-name td fossil_name_match = re.search(r'.*?>(.*?)', line) if fossil_name_match: fossil_name = re.sub(r'<[^>]+>', '', fossil_name_match.group(1)).strip() if fossil_name: current_fossil_name = fossil_name # Look for image URL in fossil-image td image_match = re.search(r'