codebook / potato /adjudication_export.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
5.07 kB
"""
Adjudication Export CLI
Generate final datasets by merging unanimous agreements and adjudication decisions.
Usage:
python -m potato.adjudication_export --config config.yaml --output final_dataset.jsonl
python -m potato.adjudication_export --config config.yaml --output final.csv --format csv
python -m potato.adjudication_export --config config.yaml --output final.json --format json
"""
import argparse
import csv
import json
import os
import sys
import logging
logger = logging.getLogger(__name__)
def main():
parser = argparse.ArgumentParser(
description="Export adjudicated dataset from Potato annotation project"
)
parser.add_argument(
"--config", required=True,
help="Path to the Potato config YAML file"
)
parser.add_argument(
"--output", required=True,
help="Output file path"
)
parser.add_argument(
"--format", choices=["jsonl", "json", "csv"], default="jsonl",
help="Output format (default: jsonl)"
)
parser.add_argument(
"--include-unresolved", action="store_true",
help="Include items without adjudication or consensus"
)
parser.add_argument(
"--verbose", "-v", action="store_true",
help="Verbose output"
)
args = parser.parse_args()
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
# Load config
from potato.server_utils.config_module import init_config, config
try:
init_config(args.config)
except Exception as e:
print(f"Error loading config: {e}", file=sys.stderr)
sys.exit(1)
# Initialize state managers
from potato.item_state_management import init_item_state_manager
from potato.user_state_management import init_user_state_manager
init_user_state_manager(config)
init_item_state_manager(config)
# Load data (this loads items and user annotations from disk)
# We need a minimal load - just items and user states
from potato.flask_server import load_instance_data, load_user_data
load_instance_data(config)
load_user_data(config)
# Initialize adjudication manager
from potato.adjudication import init_adjudication_manager
adj_mgr = init_adjudication_manager(config)
if not adj_mgr or not adj_mgr.adj_config.enabled:
print("Adjudication is not enabled in this config.", file=sys.stderr)
sys.exit(1)
# Build queue to compute agreements
adj_mgr.build_queue()
# Generate final dataset
results = adj_mgr.generate_final_dataset()
# Filter unresolved if not requested
if not args.include_unresolved:
results = [r for r in results if r.get("source") != "unresolved"]
# Write output
output_path = args.output
fmt = args.format
if fmt == "jsonl":
with open(output_path, "w") as f:
for item in results:
f.write(json.dumps(item) + "\n")
elif fmt == "json":
with open(output_path, "w") as f:
json.dump(results, f, indent=2)
elif fmt == "csv":
if not results:
print("No results to export.", file=sys.stderr)
sys.exit(0)
# Flatten for CSV
fieldnames = set()
flat_results = []
for item in results:
flat = {
"instance_id": item["instance_id"],
"source": item.get("source", ""),
}
# Flatten labels
labels = item.get("labels", {})
for schema, value in labels.items():
if isinstance(value, dict):
flat[schema] = json.dumps(value)
else:
flat[schema] = value
# Add provenance fields
if "adjudicator" in item:
flat["adjudicator"] = item["adjudicator"]
if "confidence" in item:
flat["confidence"] = item["confidence"]
if "num_annotators" in item:
flat["num_annotators"] = item["num_annotators"]
fieldnames.update(flat.keys())
flat_results.append(flat)
# Sort fieldnames for consistent output
fieldnames = sorted(fieldnames)
with open(output_path, "w", newline="") as f:
writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
writer.writeheader()
writer.writerows(flat_results)
# Summary
total = len(results)
unanimous = sum(1 for r in results if r.get("source") == "unanimous")
adjudicated = sum(1 for r in results if r.get("source") == "adjudicated")
unresolved = sum(1 for r in results if r.get("source") == "unresolved")
print(f"\nExport complete: {output_path}")
print(f" Total items: {total}")
print(f" Unanimous: {unanimous}")
print(f" Adjudicated: {adjudicated}")
if args.include_unresolved:
print(f" Unresolved: {unresolved}")
print(f" Format: {fmt}")
if __name__ == "__main__":
main()