Spaces:

Blablablab
/

codebook

Paused

App Files Files Community

codebook / potato /datasets_integration.py

davidjurgens

Deploy: Potato — Codebook Annotation

aceb1b2 verified 7 days ago

Raw

History Blame Contribute Delete

3.43 kB

	"""
	HuggingFace Datasets Integration

	Convenience API for loading Potato annotations as HuggingFace Datasets
	or pandas DataFrames — no Hub round-trip required.

	Requires: pip install "datasets>=2.14.0"

	Usage:
	from potato import load_as_dataset, load_annotations

	# Load as HuggingFace DatasetDict
	ds = load_as_dataset("path/to/config.yaml")
	print(ds["annotations"][0])

	# Load as pandas DataFrame
	df = load_annotations("path/to/config.yaml")
	print(df.head())
	"""

	import logging
	from typing import Optional

	logger = logging.getLogger(__name__)


	def load_as_dataset(config_path: str,
	                    include_spans: bool = True,
	                    include_items: bool = True):
	    """
	    Load Potato annotations as a HuggingFace DatasetDict.

	    Builds an export context from the config file and hands it to
	    HuggingFaceExporter.build_dataset_dict(), returning whatever that
	    call produces (documented upstream as a DatasetDict with up to
	    three splits: 'annotations', 'spans', and 'items').

	    Args:
	        config_path: Path to the Potato YAML config file
	        include_spans: Forwarded to the exporter; include a 'spans'
	            split (default True)
	        include_items: Forwarded to the exporter; include an 'items'
	            split (default True)

	    Returns:
	        The object returned by HuggingFaceExporter.build_dataset_dict()
	        (expected to be a datasets.DatasetDict)

	    Raises:
	        ImportError: If the 'datasets' package is not installed. The
	            original import failure is attached as __cause__.

	    Any exception raised by build_export_context() or the exporter
	    propagates unchanged.
	    NOTE(review): earlier documentation listed FileNotFoundError for a
	    missing config and ValueError when no annotations are found; that
	    depends on those helpers and cannot be confirmed from this module.
	    """
	    # Probe the optional dependency first so a missing install surfaces
	    # as a single actionable message instead of failing deep in the exporter.
	    try:
	        from datasets import DatasetDict  # noqa: F401
	    except ImportError as err:
	        # The requirement is quoted because an unquoted '>' is treated as
	        # output redirection by the shell, which would drop the version pin.
	        raise ImportError(
	            "The 'datasets' package is required for load_as_dataset(). "
	            'Install with: pip install "datasets>=2.14.0"'
	        ) from err

	    # Function-scope imports: only resolved when this function is called.
	    from potato.export.cli import build_export_context
	    from potato.export.huggingface_exporter import HuggingFaceExporter

	    context = build_export_context(config_path)
	    return HuggingFaceExporter().build_dataset_dict(
	        context,
	        include_spans=include_spans,
	        include_items=include_items,
	    )


	def load_annotations(config_path: str):
	    """
	    Load Potato annotations as a pandas DataFrame.

	    Builds an export context from the config file and flattens
	    context.annotations into one DataFrame row per annotation record.

	    Args:
	        config_path: Path to the Potato YAML config file

	    Returns:
	        pandas.DataFrame with columns 'instance_id' and 'user_id' (empty
	        string when a record lacks them) plus one column per key found
	        in the records' "labels" mappings. Label values that are dicts
	        or lists are stored as JSON strings; other values are stored
	        as-is. Records without a given label key get a missing value in
	        that column.

	    Raises:
	        ValueError: If the export context holds no annotations

	    Any exception raised by build_export_context() propagates unchanged.
	    NOTE(review): earlier documentation listed FileNotFoundError for a
	    missing config; that depends on build_export_context() and cannot
	    be confirmed from this module.
	    """
	    import json
	    import pandas as pd

	    from potato.export.cli import build_export_context

	    context = build_export_context(config_path)

	    if not context.annotations:
	        raise ValueError(
	            f"No annotations found for config: {config_path}"
	        )

	    rows = []
	    for ann in context.annotations:
	        row = {
	            "instance_id": ann.get("instance_id", ""),
	            "user_id": ann.get("user_id", ""),
	        }
	        # `or {}` also covers records whose "labels" key exists but is
	        # None, which a plain .get("labels", {}) would pass through.
	        labels = ann.get("labels") or {}
	        for schema_name, value in labels.items():
	            # Nested containers are serialized so every cell is a scalar;
	            # ensure_ascii=False keeps non-ASCII label text readable.
	            if isinstance(value, (dict, list)):
	                row[schema_name] = json.dumps(value, ensure_ascii=False)
	            else:
	                row[schema_name] = value
	        rows.append(row)

	    return pd.DataFrame(rows)