"""CLI entry point for dataset preparation.

Three modes:
- default: build a fresh lite dataset from raw Amazon metadata
  (controlled by `--category`, `--train-size` and `--test-size`).
- `--from-source --purpose structured`: load a structured `items_*`
  dataset from Hugging Face (or a local folder) and write the splits.
- `--from-source --purpose prompt`: do the same for a prompt/completion
  fine-tuning dataset.

`--from-hub` is accepted as an alias for `--from-source`.
"""

import sys
from pathlib import Path
import argparse

sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))

from dealsight_intelligence import config
from dealsight_intelligence.data.curate_lite import curate_lite, download_hub_dataset, export_prompt_dataset


# Namespace attributes that only the --from-source modes read, and that only
# the default (curate) mode reads. Used to warn about flags that have no effect.
_SOURCE_ONLY_ARGS = ("purpose", "dataset", "prefix")
_CURATE_ONLY_ARGS = ("category", "train_size", "test_size")


def build_parser() -> argparse.ArgumentParser:
    """Return the argument parser for the dataset preparation CLI."""
    parser = argparse.ArgumentParser(description="Create or download datasets for DealSight Intelligence.")
    parser.add_argument(
        "--from-hub",
        "--from-source",
        action="store_true",
        dest="from_source",
        help="Load a Hugging Face dataset repo ID or local exported dataset folder",
    )
    parser.add_argument(
        "--purpose",
        choices=["structured", "prompt"],
        default="structured",
        help="structured is for app/vectorstore/RF; prompt is for prompt training/eval only",
    )
    parser.add_argument(
        "--dataset",
        default=None,
        help="HF dataset repo ID or local dataset folder. Defaults to dealsight_intelligence_*_DATASET_SOURCE.",
    )
    parser.add_argument("--prefix", default=None, help="Output filename prefix, for example lite, full, prompts_lite")
    parser.add_argument("--category", default="Appliances", help="Amazon metadata category to load")
    parser.add_argument("--train-size", type=int, default=20000, help="Number of training items")
    parser.add_argument("--test-size", type=int, default=2000, help="Number of test items")
    return parser


def _ignored_flags(parser, args):
    """Return the flags given a non-default value that the selected mode never reads."""
    unused = _CURATE_ONLY_ARGS if args.from_source else _SOURCE_ONLY_ARGS
    return [
        "--" + name.replace("_", "-")
        for name in unused
        if getattr(args, name) != parser.get_default(name)
    ]


def _report_written(*paths):
    """Print one ``Wrote <path>`` line per path, skipping falsy entries.

    Skipping falsy entries keeps a split that was not produced from being
    reported as ``Wrote None``.
    """
    for path in paths:
        if path:
            print(f"Wrote {path}")


def main(argv=None) -> int:
    """Run the dataset preparation CLI.

    Args:
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` once the selected mode has finished. Invalid arguments make
        argparse raise ``SystemExit`` before any work is done.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Warn instead of failing so existing invocations keep working.
    ignored = _ignored_flags(parser, args)
    if ignored:
        flags = ", ".join(ignored)
        mode = "with --from-source" if args.from_source else "without --from-source"
        print(f"warning: {flags} has no effect {mode}", file=sys.stderr)

    if not args.from_source:
        # Default mode: build the lite dataset from raw metadata.
        train_path, test_path = curate_lite(
            category=args.category,
            train_size=args.train_size,
            test_size=args.test_size,
        )
        _report_written(train_path, test_path)
        return 0

    if args.purpose == "prompt":
        prefix = args.prefix or "prompts_lite"
        train_path, validation_path, test_path = export_prompt_dataset(dataset_name=args.dataset, prefix=prefix)
    else:
        # config.dataset_prefix is only consulted when no --prefix was given.
        prefix = args.prefix or config.dataset_prefix("lite")
        train_path, validation_path, test_path = download_hub_dataset(dataset_name=args.dataset, prefix=prefix)
    _report_written(train_path, validation_path, test_path)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())