#!/usr/bin/env python3 """ Standalone CLI to prepare an ACE-Step dataset from a Hugging Face dataset. Only requires: pip install datasets Usage: python prepare_from_hf_cli.py --dataset --output-dir [options] Example: python prepare_from_hf_cli.py --dataset polyai/minds14 --output-dir ./data/minds14 --split train Then preprocess and train: python train.py preprocess --dataset-json ./data/minds14/dataset.json --tensor-output ./pt_minds14 ... python train.py fixed --dataset-dir ./pt_minds14 ... """ from __future__ import annotations import argparse import sys def main() -> int: parser = argparse.ArgumentParser( description="Prepare ACE-Step dataset from a Hugging Face dataset (dataset.json + audio/)", ) parser.add_argument("--dataset", required=True, metavar="NAME", help="Hugging Face dataset id") parser.add_argument("--output-dir", required=True, metavar="DIR", help="Output directory for dataset.json and audio/") parser.add_argument("--split", default="train", help="Dataset split (default: train)") parser.add_argument("--config", default=None, help="Dataset config name if required") parser.add_argument("--caption-column", default=None, help="Caption column (default: auto-detect)") parser.add_argument("--audio-column", default=None, help="Audio column (default: auto-detect)") parser.add_argument("--max-samples", type=int, default=None, help="Max samples to export (default: all)") parser.add_argument("--audio-subdir", default="audio", help="Audio subdir under output-dir (default: audio)") parser.add_argument("--json-filename", default="dataset.json", help="Output JSON filename (default: dataset.json)") parser.add_argument("--trust-remote-code", action="store_true", help="Allow datasets with custom code") args = parser.parse_args() from acestep.training_v2.prepare_from_hf import prepare_from_hf try: result = prepare_from_hf( dataset_name=args.dataset, output_dir=args.output_dir, split=args.split, config=args.config, caption_column=args.caption_column, audio_column=args.audio_column, max_samples=args.max_samples, audio_subdir=args.audio_subdir, json_filename=args.json_filename, trust_remote_code=args.trust_remote_code, ) except ImportError as e: print(f"[FAIL] {e}", file=sys.stderr) return 1 except Exception as e: print(f"[FAIL] {e}", file=sys.stderr) return 1 print(f"\n[OK] Prepared {result['num_samples']} samples") print(f" dataset_json: {result['dataset_json']}") print(f" audio_dir: {result['audio_dir']}") print("\nNext: preprocess then train (see train.py preprocess / train.py fixed).") return 0 if __name__ == "__main__": sys.exit(main())