# lanternfly-field-capture / lanternfly.py
# rlogh's picture
# Upload 4 files
# af6b90a verified
"""
Lanternfly Dataset Loading Script for Hugging Face Datasets
This script enables loading the lanternfly field capture dataset using:
from datasets import load_dataset
dataset = load_dataset("your-username/lanternfly-data")
"""
import json
from datasets import Dataset, Features, Image, Value, Sequence
from huggingface_hub import hf_hub_download
import os
from typing import Dict, List, Any
def load_lanternfly_dataset(repo_id: str, split: str = "train") -> Dataset:
    """
    Load the lanternfly field capture dataset from Hugging Face Hub.

    Downloads ``data/records.jsonl`` from the dataset repository, parses one
    JSON object per line, downloads the image referenced by each record's
    ``image_path`` and assembles the results into a ``Dataset``.

    Args:
        repo_id: The Hugging Face repository ID (e.g., "username/lanternfly-data")
        split: Dataset split to load (default: "train"). Currently unused:
            every record in ``data/records.jsonl`` is loaded regardless of
            this value.

    Returns:
        Dataset with image and metadata features

    Raises:
        FileNotFoundError: If ``data/records.jsonl`` cannot be downloaded.
        ValueError: If the file contains no valid records, or if no record's
            image could be downloaded.
    """
    # Define the dataset features
    features = Features({
        "image": Image(),
        "metadata": {
            "image_path": Value("string"),
            "lat": Value("float64"),
            "lon": Value("float64"),
            "accuracy_m": Value("float64"),
            "device_ts": Value("string"),
            "server_ts": Value("string"),
        },
    })

    # Download the JSONL file
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename="data/records.jsonl",
            repo_type="dataset",
        )
    except Exception as e:
        # Chain the original error so the underlying hub failure stays visible.
        raise FileNotFoundError(
            f"Could not find data/records.jsonl in {repo_id}. Error: {e}"
        ) from e

    # Read and parse the JSONL file
    records = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
                continue
            # A line can be valid JSON without being an object (e.g. a list or
            # a number). Such values have no keys to read, and would make the
            # warning handler below fail on `record.get`, so drop them here.
            if not isinstance(record, dict):
                print(f"Warning: Skipping non-object record on line {line_num}")
                continue
            records.append(record)

    if not records:
        raise ValueError("No valid records found in the dataset")

    # Prepare dataset examples
    examples = []
    for record in records:
        try:
            # Download the image
            image_path = record["image_path"]
            image_file = hf_hub_download(
                repo_id=repo_id,
                filename=image_path,
                repo_type="dataset",
            )
        except Exception as e:
            # Best-effort: a missing "image_path" key or a failed download
            # skips this record instead of aborting the whole load.
            print(f"Warning: Could not load image {record.get('image_path', 'unknown')}: {e}")
            continue

        # Create example; optional metadata fields default to None.
        examples.append({
            "image": image_file,
            "metadata": {
                "image_path": image_path,
                "lat": record.get("lat"),
                "lon": record.get("lon"),
                "accuracy_m": record.get("accuracy_m"),
                "device_ts": record.get("device_ts"),
                "server_ts": record.get("server_ts"),
            },
        })

    if not examples:
        raise ValueError("No valid examples could be loaded from the dataset")

    # Create and return the dataset
    return Dataset.from_list(examples, features=features)
def get_dataset_info(repo_id: str) -> Dict[str, Any]:
    """
    Get basic information about the dataset without loading all images.

    Only ``data/records.jsonl`` is downloaded; the image files referenced by
    the records are not fetched.

    Args:
        repo_id: The Hugging Face repository ID

    Returns:
        Dictionary with dataset statistics:

        - ``total_records``: number of JSON-object lines parsed.
        - ``records_with_gps``: number of records whose ``lat`` is not None.
        - ``date_range``: ``earliest`` / ``latest`` ``server_ts`` values, or
          ``None`` for both when no record carries a ``server_ts``.
        - ``location_bounds``: present only when both ``lat`` and ``lon``
          values exist.
        - ``gps_accuracy``: present only when ``accuracy_m`` values exist.

        If the records file cannot be downloaded or holds no records, a
        dictionary with a single ``error`` key is returned instead.
    """
    try:
        jsonl_path = hf_hub_download(
            repo_id=repo_id,
            filename="data/records.jsonl",
            repo_type="dataset",
        )
    except Exception as e:
        # Deliberately broad: this function reports failures in the returned
        # dictionary rather than raising.
        return {"error": f"Could not access dataset: {e}"}

    records = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (list, number, ...) has no
            # fields to summarise and would break the `.get` lookups below.
            if isinstance(record, dict):
                records.append(record)

    if not records:
        return {"error": "No records found"}

    # Calculate statistics
    lats = [r.get("lat") for r in records if r.get("lat") is not None]
    lons = [r.get("lon") for r in records if r.get("lon") is not None]
    accuracies = [r.get("accuracy_m") for r in records if r.get("accuracy_m") is not None]
    # NOTE(review): min/max compare the raw values; this presumably relies on
    # server_ts being ISO-8601-style strings so that lexicographic order
    # matches chronological order — confirm against the data producer.
    timestamps = [r["server_ts"] for r in records if r.get("server_ts")]

    info = {
        "total_records": len(records),
        "records_with_gps": len(lats),
        "date_range": {
            # default=None avoids a ValueError when no record has a timestamp.
            "earliest": min(timestamps, default=None),
            "latest": max(timestamps, default=None),
        },
    }

    if lats and lons:
        info["location_bounds"] = {
            "min_lat": min(lats),
            "max_lat": max(lats),
            "min_lon": min(lons),
            "max_lon": max(lons),
        }

    if accuracies:
        info["gps_accuracy"] = {
            "min": min(accuracies),
            "max": max(accuracies),
            "avg": sum(accuracies) / len(accuracies),
        }

    return info
# Example usage
# Running this file directly performs no work: the calls below are left
# commented out as templates. Replace "your-username/lanternfly-data" with a
# real dataset repository ID before enabling them.
if __name__ == "__main__":
    # Example: Load dataset
    # dataset = load_lanternfly_dataset("your-username/lanternfly-data")
    # print(f"Loaded {len(dataset)} examples")
    # Example: Get dataset info
    # info = get_dataset_info("your-username/lanternfly-data")
    # print(f"Dataset info: {info}")
    pass