# Ace-Step-Munk / prepare_from_hf_cli.py
# Commit 3cdb1cf (OnyxlMunkey): Add Song Describer pipeline:
#   prepare_song_describer.py, prepare_from_hf, preprocess/train CLI
#!/usr/bin/env python3
"""
Standalone CLI to prepare an ACE-Step dataset from a Hugging Face dataset.
Only requires: pip install datasets
Usage:
python prepare_from_hf_cli.py --dataset <HF_DATASET_ID> --output-dir <DIR> [options]
Example:
python prepare_from_hf_cli.py --dataset polyai/minds14 --output-dir ./data/minds14 --split train
Then preprocess and train:
python train.py preprocess --dataset-json ./data/minds14/dataset.json --tensor-output ./pt_minds14 ...
python train.py fixed --dataset-dir ./pt_minds14 ...
"""
from __future__ import annotations
import argparse
import sys
def main() -> int:
    """Parse CLI arguments and export a Hugging Face dataset for ACE-Step.

    Builds the argparse interface, delegates the actual export to
    ``acestep.training_v2.prepare_from_hf.prepare_from_hf``, and reports
    the result paths on success.

    Returns:
        Process exit code: 0 on success, 1 on any failure (missing
        dependency or export error).
    """
    parser = argparse.ArgumentParser(
        description="Prepare ACE-Step dataset from a Hugging Face dataset (dataset.json + audio/)",
    )
    parser.add_argument("--dataset", required=True, metavar="NAME", help="Hugging Face dataset id")
    parser.add_argument("--output-dir", required=True, metavar="DIR", help="Output directory for dataset.json and audio/")
    parser.add_argument("--split", default="train", help="Dataset split (default: train)")
    parser.add_argument("--config", default=None, help="Dataset config name if required")
    parser.add_argument("--caption-column", default=None, help="Caption column (default: auto-detect)")
    parser.add_argument("--audio-column", default=None, help="Audio column (default: auto-detect)")
    parser.add_argument("--max-samples", type=int, default=None, help="Max samples to export (default: all)")
    parser.add_argument("--audio-subdir", default="audio", help="Audio subdir under output-dir (default: audio)")
    parser.add_argument("--json-filename", default="dataset.json", help="Output JSON filename (default: dataset.json)")
    parser.add_argument("--trust-remote-code", action="store_true", help="Allow datasets with custom code")
    args = parser.parse_args()

    try:
        # Imported lazily, *inside* the try: in the original the import sat
        # before the try block, so the `except ImportError` handler below
        # could never fire for the most likely ImportError (acestep or its
        # `datasets` dependency not installed) and users got a raw traceback.
        from acestep.training_v2.prepare_from_hf import prepare_from_hf

        result = prepare_from_hf(
            dataset_name=args.dataset,
            output_dir=args.output_dir,
            split=args.split,
            config=args.config,
            caption_column=args.caption_column,
            audio_column=args.audio_column,
            max_samples=args.max_samples,
            audio_subdir=args.audio_subdir,
            json_filename=args.json_filename,
            trust_remote_code=args.trust_remote_code,
        )
    except ImportError as e:
        # Missing dependency — the module docstring says the only extra
        # requirement is `datasets`, so point the user at the fix.
        print(f"[FAIL] {e}", file=sys.stderr)
        print("Hint: pip install datasets", file=sys.stderr)
        return 1
    except Exception as e:
        # Any export failure (bad dataset id, missing column, I/O error)
        # is surfaced as a CLI error rather than a traceback.
        print(f"[FAIL] {e}", file=sys.stderr)
        return 1

    print(f"\n[OK] Prepared {result['num_samples']} samples")
    print(f" dataset_json: {result['dataset_json']}")
    print(f" audio_dir: {result['audio_dir']}")
    print("\nNext: preprocess then train (see train.py preprocess / train.py fixed).")
    return 0
if __name__ == "__main__":
    # Script entry point: propagate main()'s exit status to the shell.
    raise SystemExit(main())