Spaces:

rlogh
/

lanternfly-field-capture

Sleeping

App Files Files Community

lanternfly-field-capture / lanternfly.py

rlogh

Upload 4 files

af6b90a verified 3 months ago

raw

history blame contribute delete

5.8 kB

	"""
	Lanternfly Dataset Loading Script for Hugging Face Datasets

	This script loads the lanternfly field capture dataset from the Hugging Face Hub using:
	from lanternfly import load_lanternfly_dataset
	dataset = load_lanternfly_dataset("your-username/lanternfly-data")
	"""

	import json
	from datasets import Dataset, Features, Image, Value, Sequence
	from huggingface_hub import hf_hub_download
	import os
	from typing import Dict, List, Any

	def load_lanternfly_dataset(repo_id: str, split: str = "train") -> Dataset:
	    """
	    Load the lanternfly field capture dataset from Hugging Face Hub.

	    Downloads data/records.jsonl from the dataset repository, then downloads
	    the image referenced by each record's "image_path" and builds a Dataset
	    from the results. Lines that are blank, not valid JSON, or not JSON
	    objects are skipped, as are records whose image cannot be downloaded;
	    each skip (except blank lines) prints a warning.

	    Args:
	        repo_id: The Hugging Face repository ID (e.g., "username/lanternfly-data")
	        split: Dataset split to load (default: "train"). Accepted but not
	            used: all records from data/records.jsonl are returned
	            regardless of its value.

	    Returns:
	        Dataset with image and metadata features

	    Raises:
	        FileNotFoundError: If data/records.jsonl cannot be downloaded.
	        ValueError: If the file contains no valid records, or if no example
	            could be built from them.
	    """

	    # Define the dataset features
	    features = Features({
	        "image": Image(),
	        "metadata": {
	            "image_path": Value("string"),
	            "lat": Value("float64"),
	            "lon": Value("float64"),
	            "accuracy_m": Value("float64"),
	            "device_ts": Value("string"),
	            "server_ts": Value("string")
	        }
	    })

	    # Download the JSONL file
	    try:
	        jsonl_path = hf_hub_download(
	            repo_id=repo_id,
	            filename="data/records.jsonl",
	            repo_type="dataset"
	        )
	    except Exception as e:
	        # Chain the original exception so its traceback is preserved.
	        raise FileNotFoundError(
	            f"Could not find data/records.jsonl in {repo_id}. Error: {e}"
	        ) from e

	    # Read and parse the JSONL file
	    records = []
	    with open(jsonl_path, 'r', encoding='utf-8') as f:
	        for line_num, line in enumerate(f, 1):
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                record = json.loads(line)
	            except json.JSONDecodeError as e:
	                print(f"Warning: Skipping invalid JSON on line {line_num}: {e}")
	                continue
	            # A line such as `42` or `[]` is valid JSON but has no fields;
	            # letting it through would break the key lookups below.
	            if not isinstance(record, dict):
	                print(f"Warning: Skipping non-object record on line {line_num}")
	                continue
	            records.append(record)

	    if not records:
	        raise ValueError("No valid records found in the dataset")

	    # Prepare dataset examples
	    examples = []
	    for record in records:
	        try:
	            # Download the image
	            image_path = record["image_path"]
	            image_file = hf_hub_download(
	                repo_id=repo_id,
	                filename=image_path,
	                repo_type="dataset"
	            )
	        except Exception as e:
	            # Best-effort: one missing or unreadable image must not abort the load.
	            print(f"Warning: Could not load image {record.get('image_path', 'unknown')}: {e}")
	            continue

	        # Create example; absent metadata fields become None.
	        examples.append({
	            "image": image_file,
	            "metadata": {
	                "image_path": image_path,
	                "lat": record.get("lat"),
	                "lon": record.get("lon"),
	                "accuracy_m": record.get("accuracy_m"),
	                "device_ts": record.get("device_ts"),
	                "server_ts": record.get("server_ts")
	            }
	        })

	    if not examples:
	        raise ValueError("No valid examples could be loaded from the dataset")

	    # Create and return the dataset
	    return Dataset.from_list(examples, features=features)

	def get_dataset_info(repo_id: str) -> Dict[str, Any]:
	    """
	    Get basic information about the dataset without loading all images.

	    Only data/records.jsonl is downloaded. Lines that are blank, not valid
	    JSON, or not JSON objects are ignored.

	    Args:
	        repo_id: The Hugging Face repository ID

	    Returns:
	        Dictionary with dataset statistics:
	        - "total_records": number of parsed records
	        - "records_with_gps": number of records with a non-null "lat"
	        - "date_range": "earliest" and "latest" "server_ts" values, or None
	          for both when no record carries a timestamp
	        - "location_bounds": min/max lat and lon (only when both are present)
	        - "gps_accuracy": min/max/avg of "accuracy_m" (only when present)
	        If the file cannot be downloaded or holds no records, a dictionary
	        with a single "error" key is returned instead.
	    """
	    try:
	        jsonl_path = hf_hub_download(
	            repo_id=repo_id,
	            filename="data/records.jsonl",
	            repo_type="dataset"
	        )
	    except Exception as e:
	        return {"error": f"Could not access dataset: {e}"}

	    records = []
	    with open(jsonl_path, 'r', encoding='utf-8') as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                record = json.loads(line)
	            except json.JSONDecodeError:
	                continue
	            # Only JSON objects have fields to summarise; scalars and arrays
	            # would break the .get() lookups below.
	            if isinstance(record, dict):
	                records.append(record)

	    if not records:
	        return {"error": "No records found"}

	    # Calculate statistics
	    lats = [r["lat"] for r in records if r.get("lat") is not None]
	    lons = [r["lon"] for r in records if r.get("lon") is not None]
	    accuracies = [r["accuracy_m"] for r in records if r.get("accuracy_m") is not None]
	    # Timestamps are compared as stored (lexicographically for strings);
	    # presumably ISO-8601, so that order matches time order -- TODO confirm.
	    timestamps = [r["server_ts"] for r in records if r.get("server_ts")]

	    info = {
	        "total_records": len(records),
	        "records_with_gps": len(lats),
	        "date_range": {
	            # default=None avoids a ValueError when no record has a timestamp.
	            "earliest": min(timestamps, default=None),
	            "latest": max(timestamps, default=None)
	        }
	    }

	    if lats and lons:
	        info["location_bounds"] = {
	            "min_lat": min(lats),
	            "max_lat": max(lats),
	            "min_lon": min(lons),
	            "max_lon": max(lons)
	        }

	    if accuracies:
	        info["gps_accuracy"] = {
	            "min": min(accuracies),
	            "max": max(accuracies),
	            "avg": sum(accuracies) / len(accuracies)
	        }

	    return info

	# Example usage (left commented out; running this file directly does nothing)
	if __name__ == "__main__":
	    # Load the dataset, downloading every referenced image:
	    # dataset = load_lanternfly_dataset("your-username/lanternfly-data")
	    # print(f"Loaded {len(dataset)} examples")
	    #
	    # Summarise the records file without downloading images:
	    # info = get_dataset_info("your-username/lanternfly-data")
	    # print(f"Dataset info: {info}")
	    pass