codebook / potato /datasets_integration.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
3.43 kB
"""
HuggingFace Datasets Integration
Convenience API for loading Potato annotations as HuggingFace Datasets
or pandas DataFrames — no Hub round-trip required.
Requires: pip install "datasets>=2.14.0"
Usage:
from potato import load_as_dataset, load_annotations
# Load as HuggingFace DatasetDict
ds = load_as_dataset("path/to/config.yaml")
print(ds["annotations"][0])
# Load as pandas DataFrame
df = load_annotations("path/to/config.yaml")
print(df.head())
"""
import logging
from typing import Optional
logger = logging.getLogger(__name__)
def load_as_dataset(config_path: str,
                    include_spans: bool = True,
                    include_items: bool = True):
    """
    Load Potato annotations as a HuggingFace DatasetDict.

    Builds an export context from the config file and delegates to
    ``HuggingFaceExporter.build_dataset_dict``. The result is an in-memory
    DatasetDict with up to three splits: 'annotations', 'spans', and
    'items'.

    Args:
        config_path: Path to the Potato YAML config file
        include_spans: Include a 'spans' split (default True)
        include_items: Include an 'items' split (default True)

    Returns:
        datasets.DatasetDict with annotation data

    Raises:
        ImportError: If the 'datasets' package is not installed. The
            original import failure is attached as ``__cause__``.
        FileNotFoundError: If config_path does not exist (propagated from
            the export helpers)
        ValueError: If no annotations are found (propagated from the
            export helpers)
    """
    # Probe for the optional dependency first so the caller gets an
    # actionable message instead of a failure deep inside the exporter.
    try:
        from datasets import DatasetDict  # noqa: F401
    except ImportError as err:
        # The requirement is quoted because an unquoted ">=" is parsed by
        # most shells as an output redirection.
        raise ImportError(
            "The 'datasets' package is required for load_as_dataset(). "
            'Install with: pip install "datasets>=2.14.0"'
        ) from err

    # Imported lazily so importing this module never requires the export
    # machinery (or 'datasets') to be importable.
    from potato.export.cli import build_export_context
    from potato.export.huggingface_exporter import HuggingFaceExporter

    context = build_export_context(config_path)
    exporter = HuggingFaceExporter()
    return exporter.build_dataset_dict(
        context,
        include_spans=include_spans,
        include_items=include_items,
    )
def load_annotations(config_path: str):
    """
    Load Potato annotations as a pandas DataFrame.

    Builds an export context from the config file and flattens its
    annotations into one row per annotation record, i.e. one row per
    (instance, user) pair.

    Args:
        config_path: Path to the Potato YAML config file

    Returns:
        pandas.DataFrame with columns ``instance_id``, ``user_id``, and one
        column per label key found in the annotations. Dict and list
        label values are stored as JSON strings; other values are stored
        as-is. A record with no labels contributes only the two id columns
        (its label cells are NaN).

    Raises:
        FileNotFoundError: If config_path does not exist (propagated from
            the export helpers)
        ValueError: If no annotations are found
    """
    import json

    import pandas as pd

    from potato.export.cli import build_export_context

    context = build_export_context(config_path)
    if not context.annotations:
        raise ValueError(
            f"No annotations found for config: {config_path}"
        )

    rows = []
    for ann in context.annotations:
        row = {
            "instance_id": ann.get("instance_id", ""),
            "user_id": ann.get("user_id", ""),
        }
        # "labels" may be absent or explicitly null; treat both as empty
        # rather than crashing on None.items().
        labels = ann.get("labels") or {}
        for schema_name, value in labels.items():
            if isinstance(value, (dict, list)):
                # Nested values are serialized so every cell is a scalar;
                # ensure_ascii=False keeps non-ASCII text readable.
                row[schema_name] = json.dumps(value, ensure_ascii=False)
            else:
                row[schema_name] = value
        rows.append(row)
    return pd.DataFrame(rows)