"""
Lanternfly Dataset Loading Script for Hugging Face Datasets

This script enables loading the lanternfly field capture dataset using:
    from datasets import load_dataset
    dataset = load_dataset("your-username/lanternfly-data")

It also exposes two helpers that can be called directly:
    load_lanternfly_dataset(repo_id)  -> Dataset with images and metadata
    get_dataset_info(repo_id)         -> summary statistics, no image downloads
"""

import json
import os
from typing import Dict, List, Any

from datasets import Dataset, Features, Image, Value, Sequence
from huggingface_hub import hf_hub_download

# Path, inside the dataset repository, of the JSONL file listing every record.
_RECORDS_FILENAME = "data/records.jsonl"


def _read_jsonl_records(jsonl_path: str, *, warn: bool = False) -> List[Dict[str, Any]]:
    """Parse a JSONL file into a list of dict records.

    Blank lines, lines that are not valid JSON, and lines whose JSON value is
    not an object are skipped, so callers can rely on every returned record
    supporting ``.get``.

    Args:
        jsonl_path: Local path of the JSONL file.
        warn: When True, print a warning for every skipped non-blank line.

    Returns:
        The parsed records, in file order.
    """
    records: List[Dict[str, Any]] = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                if warn:
                    print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
                continue
            # A syntactically valid line such as `[]` or `"x"` is not a record;
            # letting it through would crash later on `record.get(...)`.
            if not isinstance(record, dict):
                if warn:
                    print(f"Warning: Skipping non-object JSON on line {line_num}")
                continue
            records.append(record)
    return records


def load_lanternfly_dataset(repo_id: str, split: str = "train") -> Dataset:
    """
    Load the lanternfly field capture dataset from Hugging Face Hub.

    The records file ``data/records.jsonl`` is downloaded first, then the image
    referenced by each record is downloaded individually. Records whose image
    cannot be fetched (or that have no ``image_path``) are skipped with a
    printed warning.

    Args:
        repo_id: The Hugging Face repository ID (e.g., "username/lanternfly-data")
        split: Dataset split to load (default: "train"). Currently not used by
            this function: every record in the records file is returned
            regardless of its value.

    Returns:
        Dataset with image and metadata features

    Raises:
        FileNotFoundError: If ``data/records.jsonl`` cannot be downloaded.
        ValueError: If the records file contains no valid records, or if no
            example could be built from them.
    """
    # Define the dataset features
    features = Features({
        "image": Image(),
        "metadata": {
            "image_path": Value("string"),
            "lat": Value("float64"),
            "lon": Value("float64"),
            "accuracy_m": Value("float64"),
            "device_ts": Value("string"),
            "server_ts": Value("string"),
        },
    })

    # Download the JSONL file
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename=_RECORDS_FILENAME,
            repo_type="dataset",
        )
    except Exception as e:
        # Chain the original error so the real cause stays in the traceback.
        raise FileNotFoundError(
            f"Could not find data/records.jsonl in {repo_id}. Error: {e}"
        ) from e

    # Read and parse the JSONL file
    records = _read_jsonl_records(jsonl_path, warn=True)
    if not records:
        raise ValueError("No valid records found in the dataset")

    # Prepare dataset examples
    examples = []
    for record in records:
        try:
            # Download the image; a missing "image_path" key raises KeyError
            # and is handled like any other per-record failure.
            image_path = record["image_path"]
            image_file = hf_hub_download(
                repo_id=repo_id,
                filename=image_path,
                repo_type="dataset",
            )
        except Exception as e:
            # Deliberate best-effort: one bad record must not abort the load.
            print(f"Warning: Could not load image {record.get('image_path', 'unknown')}: {e}")
            continue

        # "image" holds the local path returned by hf_hub_download.
        examples.append({
            "image": image_file,
            "metadata": {
                "image_path": image_path,
                "lat": record.get("lat"),
                "lon": record.get("lon"),
                "accuracy_m": record.get("accuracy_m"),
                "device_ts": record.get("device_ts"),
                "server_ts": record.get("server_ts"),
            },
        })

    if not examples:
        raise ValueError("No valid examples could be loaded from the dataset")

    # Create and return the dataset
    return Dataset.from_list(examples, features=features)


def get_dataset_info(repo_id: str) -> Dict[str, Any]:
    """
    Get basic information about the dataset without loading all images.

    Only ``data/records.jsonl`` is downloaded. Failures are reported through
    the returned dictionary rather than by raising.

    Args:
        repo_id: The Hugging Face repository ID

    Returns:
        Dictionary with dataset statistics:
            - "total_records": number of valid records
            - "records_with_gps": number of records with a non-null "lat"
            - "date_range": {"earliest", "latest"} over the non-empty
              "server_ts" values, or None for both when no record has one
            - "location_bounds": present only when both lat and lon values exist
            - "gps_accuracy": min/max/avg of "accuracy_m", present only when
              at least one value exists
        or ``{"error": <message>}`` when the records file cannot be accessed
        or contains no valid records.
    """
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename=_RECORDS_FILENAME,
            repo_type="dataset",
        )
    except Exception as e:
        return {"error": f"Could not access dataset: {e}"}

    records = _read_jsonl_records(jsonl_path)
    if not records:
        return {"error": "No records found"}

    # Calculate statistics
    lats = [r["lat"] for r in records if r.get("lat") is not None]
    lons = [r["lon"] for r in records if r.get("lon") is not None]
    accuracies = [r["accuracy_m"] for r in records if r.get("accuracy_m") is not None]
    timestamps = [r["server_ts"] for r in records if r.get("server_ts")]

    info: Dict[str, Any] = {
        "total_records": len(records),
        # Counted by latitude only, matching the original definition.
        "records_with_gps": len(lats),
        "date_range": {
            # default=None avoids ValueError when no record has a timestamp.
            "earliest": min(timestamps, default=None),
            "latest": max(timestamps, default=None),
        },
    }

    if lats and lons:
        info["location_bounds"] = {
            "min_lat": min(lats),
            "max_lat": max(lats),
            "min_lon": min(lons),
            "max_lon": max(lons),
        }

    if accuracies:
        info["gps_accuracy"] = {
            "min": min(accuracies),
            "max": max(accuracies),
            "avg": sum(accuracies) / len(accuracies),
        }

    return info


# Example usage
if __name__ == "__main__":
    # Example: Load dataset
    # dataset = load_lanternfly_dataset("your-username/lanternfly-data")
    # print(f"Loaded {len(dataset)} examples")

    # Example: Get dataset info
    # info = get_dataset_info("your-username/lanternfly-data")
    # print(f"Dataset info: {info}")
    pass