Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Create CSV file from JSON response files and HTML image links. | |
| This script: | |
| 1. Parses all JSON files from ../Unknown/ directory | |
| 2. Filters entries where User Selection is not null and not "Not Applicable" | |
| 3. Extracts image URLs from index.html by matching fossil names | |
| 4. Creates a CSV file with Serial Number, Fossil Name, User Selection, and Image URL | |
| """ | |
| import json | |
| import os | |
| import glob | |
| import re | |
| import csv | |
| from typing import List, Dict, Optional | |
| from collections import defaultdict | |
# Paths
# Directory containing this script; all other paths are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Sibling "Unknown" directory — holds the JSON response files and index.html.
UNKNOWN_DIR = os.path.join(os.path.dirname(SCRIPT_DIR), "Unknown")
HTML_FILE = os.path.join(UNKNOWN_DIR, "index.html")
# Output CSV is written next to this script.
OUTPUT_CSV = os.path.join(SCRIPT_DIR, "fossil_responses_with_images.csv")
def load_json_files(json_dir: str) -> List[Dict]:
    """
    Load all JSON response files and extract valid entries.

    Scans ``json_dir`` for files matching ``unidentified_fossil_responses*.json``,
    keeps entries whose "User Selection" is set and is neither "Not Applicable"
    nor "Impossible", then de-duplicates on (Serial Number, Fossil Name).

    Args:
        json_dir: Directory containing JSON files

    Returns:
        List of dictionaries with Serial Number, Fossil Name, User Selection
    """
    # Selections that disqualify an entry; a single membership test replaces
    # the original three chained comparisons.
    rejected = (None, "Not Applicable", "Impossible")

    all_entries: List[Dict] = []
    json_files = glob.glob(os.path.join(json_dir, "unidentified_fossil_responses*.json"))
    print(f"Found {len(json_files)} JSON files")

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Filter entries: exclude null, "Not Applicable", and "Impossible"
            valid_entries = [
                entry for entry in data
                if entry.get("User Selection") not in rejected
            ]
            all_entries.extend(valid_entries)
            print(f" {os.path.basename(json_file)}: {len(valid_entries)} valid entries")
        except Exception as e:
            print(f"Error loading {json_file}: {e}")
            continue

    # De-duplicate on (Serial Number, Fossil Name).  A dict keyed by that pair
    # replaces the original O(n^2) linear rescan of the unique list; insertion
    # order is preserved, so output ordering is unchanged.  First occurrence
    # wins unless it lacks a User Selection and a later duplicate has one
    # (kept for parity with the original logic, although the filter above
    # already guarantees a non-null selection).
    unique: Dict[tuple, Dict] = {}
    for entry in all_entries:
        key = (entry.get("Serial Number"), entry.get("Fossil Name"))
        existing = unique.get(key)
        if existing is None:
            unique[key] = entry
        elif entry.get("User Selection") and not existing.get("User Selection"):
            unique[key] = entry

    unique_entries = list(unique.values())
    print(f"\nTotal unique valid entries: {len(unique_entries)}")
    return unique_entries
def extract_image_urls_from_html(html_file: str) -> Dict[str, str]:
    """
    Extract image URLs from HTML file by parsing img tags with fossil-image class.

    Streams the file line by line (better performance on large files) and pairs
    each fossil-name cell with the next fossil-image cell that follows it.

    Args:
        html_file: Path to index.html file

    Returns:
        Dictionary mapping fossil_name to image_url
    """
    image_urls: Dict[str, str] = {}
    if not os.path.exists(html_file):
        print(f"Warning: HTML file not found: {html_file}")
        return image_urls

    print(f"\nParsing HTML file: {html_file}")

    # Compile the patterns once, outside the per-line hot loop, instead of
    # handing pattern strings to re.search/re.sub on every line.
    name_re = re.compile(r'<td class="fossil-name">.*?>(.*?)</a>')
    image_re = re.compile(
        r'<td class="fossil-image"><img src="(https://storage\.googleapis\.com/serrelab/fossil_lens/inference_concepts2/[^"]+)"'
    )
    tag_re = re.compile(r'<[^>]+>')  # strips any markup nested inside the name

    try:
        current_fossil_name: Optional[str] = None
        with open(html_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Look for fossil name in fossil-name td
                name_match = name_re.search(line)
                if name_match:
                    fossil_name = tag_re.sub('', name_match.group(1)).strip()
                    if fossil_name:
                        current_fossil_name = fossil_name
                # Look for image URL in fossil-image td
                image_match = image_re.search(line)
                if image_match and current_fossil_name:
                    image_urls[current_fossil_name] = image_match.group(1)
                    current_fossil_name = None  # Reset after finding image
        print(f"Extracted {len(image_urls)} image URLs from HTML")
    except Exception as e:
        print(f"Error parsing HTML file: {e}")
        import traceback
        traceback.print_exc()
    return image_urls
def create_csv(entries: List[Dict], image_urls: Dict[str, str], output_file: str):
    """
    Create CSV file with fossil data and image URLs.

    Note: sorts ``entries`` in place by Serial Number before writing.

    Args:
        entries: List of fossil entries from JSON files
        image_urls: Dictionary mapping fossil names to image URLs
        output_file: Path to output CSV file
    """
    print(f"\nCreating CSV file: {output_file}")

    # Rows are emitted in Serial Number order (in-place sort).
    entries.sort(key=lambda x: x.get("Serial Number", 0))

    matched_count = 0
    unmatched_count = 0

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['Serial Number', 'Fossil Name', 'User Selection', 'Image URL'],
        )
        writer.writeheader()

        for entry in entries:
            name = entry.get("Fossil Name", "")
            url = image_urls.get(name, "")
            if url:
                matched_count += 1
            else:
                unmatched_count += 1
            # Excel-style clickable cell: =HYPERLINK("url", "View Image")
            link = f'=HYPERLINK("{url}", "View Image")' if url else ""
            writer.writerow({
                'Serial Number': entry.get("Serial Number", ""),
                'Fossil Name': name,
                'User Selection': entry.get("User Selection", ""),
                'Image URL': link,
            })

    print(f"\nCSV file created successfully!")
    print(f" Total entries: {len(entries)}")
    print(f" Matched with image URLs: {matched_count}")
    print(f" Unmatched (no image URL): {unmatched_count}")
def main():
    """Main function to orchestrate the CSV creation process."""
    banner = "=" * 60
    print(banner)
    print("Creating CSV from JSON responses and HTML image links")
    print(banner)

    # Step 1: Load and filter JSON files
    entries = load_json_files(UNKNOWN_DIR)
    if not entries:
        print("No valid entries found. Exiting.")
        return

    # Step 2: Extract image URLs from HTML
    image_urls = extract_image_urls_from_html(HTML_FILE)

    # Step 3: Create CSV file
    create_csv(entries, image_urls, OUTPUT_CSV)

    print("\n" + banner)
    print(f"Done! Output file: {OUTPUT_CSV}")
    print(banner)


if __name__ == "__main__":
    main()