# NOTE(review): the three lines below were scraped Hugging Face Space status
# text ("Spaces: / Sleeping / Sleeping"), not part of the script; kept as a
# comment so the file remains valid Python.
"""
Lanternfly Dataset Loading Script for Hugging Face Datasets

This script enables loading the lanternfly field capture dataset using:

    from datasets import load_dataset
    dataset = load_dataset("your-username/lanternfly-data")
"""
import json
from datasets import Dataset, Features, Image, Value, Sequence
from huggingface_hub import hf_hub_download
import os
from typing import Dict, List, Any
def load_lanternfly_dataset(repo_id: str, split: str = "train") -> Dataset:
    """
    Load the lanternfly field capture dataset from Hugging Face Hub.

    Args:
        repo_id: The Hugging Face repository ID (e.g., "username/lanternfly-data")
        split: Dataset split to load (default: "train"). Only "train" exists;
            any other value raises ValueError instead of silently returning
            the full dataset.

    Returns:
        Dataset with image and metadata features.

    Raises:
        FileNotFoundError: If data/records.jsonl cannot be downloaded.
        ValueError: If the split is unknown, no valid records are found, or
            no image could be loaded.
    """
    # BUG FIX: the original accepted `split` but never used it, so callers
    # asking for e.g. split="test" silently got the train data back.
    if split != "train":
        raise ValueError(f"Unknown split {split!r}; this dataset only has 'train'")

    # Schema: one image plus the flat GPS/timestamp metadata stored per capture.
    features = Features({
        "image": Image(),
        "metadata": {
            "image_path": Value("string"),
            "lat": Value("float64"),
            "lon": Value("float64"),
            "accuracy_m": Value("float64"),
            "device_ts": Value("string"),
            "server_ts": Value("string")
        }
    })

    # Download the JSONL manifest listing every capture.
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename="data/records.jsonl",
            repo_type="dataset"
        )
    except Exception as e:
        raise FileNotFoundError(f"Could not find data/records.jsonl in {repo_id}. Error: {e}")

    # Parse line-delimited JSON; blank and malformed lines are skipped with a
    # warning rather than aborting the whole load (best-effort by design).
    records = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
    if not records:
        raise ValueError("No valid records found in the dataset")

    # Fetch each record's image; a record whose image cannot be downloaded is
    # skipped (same best-effort policy as the JSONL parsing above).
    examples = []
    for record in records:
        try:
            image_file = hf_hub_download(
                repo_id=repo_id,
                filename=record["image_path"],
                repo_type="dataset"
            )
            examples.append({
                "image": image_file,
                "metadata": {
                    "image_path": record["image_path"],
                    # .get() so a record missing optional GPS fields yields None
                    # instead of raising KeyError.
                    "lat": record.get("lat"),
                    "lon": record.get("lon"),
                    "accuracy_m": record.get("accuracy_m"),
                    "device_ts": record.get("device_ts"),
                    "server_ts": record.get("server_ts")
                }
            })
        except Exception as e:
            print(f"Warning: Could not load image {record.get('image_path', 'unknown')}: {e}")
    if not examples:
        raise ValueError("No valid examples could be loaded from the dataset")

    return Dataset.from_list(examples, features=features)
def get_dataset_info(repo_id: str) -> Dict[str, Any]:
    """
    Get basic information about the dataset without loading all images.

    Args:
        repo_id: The Hugging Face repository ID

    Returns:
        Dictionary with dataset statistics (record counts, optional date
        range, GPS bounds, and accuracy summary), or {"error": ...} if the
        dataset cannot be accessed or contains no records.
    """
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename="data/records.jsonl",
            repo_type="dataset"
        )
    except Exception as e:
        return {"error": f"Could not access dataset: {e}"}

    # Parse JSONL, silently skipping blank and malformed lines.
    records = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                continue
    if not records:
        return {"error": "No records found"}

    # Collect the optional fields once; each list keeps only present values.
    lats = [r["lat"] for r in records if r.get("lat") is not None]
    lons = [r["lon"] for r in records if r.get("lon") is not None]
    accuracies = [r["accuracy_m"] for r in records if r.get("accuracy_m") is not None]
    timestamps = [r["server_ts"] for r in records if r.get("server_ts")]

    info: Dict[str, Any] = {
        "total_records": len(records),
        # A record "has GPS" when its lat is present (matches original logic).
        "records_with_gps": len(lats),
    }

    # BUG FIX: the original unconditionally computed min()/max() over a
    # generator filtered on server_ts, which raises ValueError when no record
    # carries a server_ts. Only emit date_range when timestamps exist.
    if timestamps:
        info["date_range"] = {
            "earliest": min(timestamps),
            "latest": max(timestamps)
        }
    if lats and lons:
        info["location_bounds"] = {
            "min_lat": min(lats),
            "max_lat": max(lats),
            "min_lon": min(lons),
            "max_lon": max(lons)
        }
    if accuracies:
        info["gps_accuracy"] = {
            "min": min(accuracies),
            "max": max(accuracies),
            "avg": sum(accuracies) / len(accuracies)
        }
    return info
# Example usage (kept inert: this module is intended to be imported).
if __name__ == "__main__":
    # Uncomment against a real repo to try the loaders:
    #   dataset = load_lanternfly_dataset("your-username/lanternfly-data")
    #   print(f"Loaded {len(dataset)} examples")
    #   info = get_dataset_info("your-username/lanternfly-data")
    #   print(f"Dataset info: {info}")
    pass