| """CLI entry point for dataset preparation. |
| |
| Three modes: |
| - default: build a fresh lite dataset from raw Amazon metadata. |
| - `--from-source --purpose structured`: download a structured `items_*` |
| dataset from Hugging Face (or a local folder) and write the splits. |
| - `--from-source --purpose prompt`: do the same for a prompt/completion |
| fine-tuning dataset. |
| """ |
|
|
| import sys |
| from pathlib import Path |
| import argparse |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) |
|
|
| from dealsight_intelligence import config |
| from dealsight_intelligence.data.curate_lite import curate_lite, download_hub_dataset, export_prompt_dataset |
|
|
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser(description="Create or download datasets for DealSight Intelligence.") |
| parser.add_argument( |
| "--from-hub", |
| "--from-source", |
| action="store_true", |
| dest="from_source", |
| help="Load a Hugging Face dataset repo ID or local exported dataset folder", |
| ) |
| parser.add_argument( |
| "--purpose", |
| choices=["structured", "prompt"], |
| default="structured", |
| help="structured is for app/vectorstore/RF; prompt is for prompt training/eval only", |
| ) |
| parser.add_argument( |
| "--dataset", |
| default=None, |
| help="HF dataset repo ID or local dataset folder. Defaults to dealsight_intelligence_*_DATASET_SOURCE.", |
| ) |
| parser.add_argument("--prefix", default=None, help="Output filename prefix, for example lite, full, prompts_lite") |
| parser.add_argument("--category", default="Appliances", help="Amazon metadata category to load") |
| parser.add_argument("--train-size", type=int, default=20000, help="Number of training items") |
| parser.add_argument("--test-size", type=int, default=2000, help="Number of test items") |
| args = parser.parse_args() |
| if args.from_source: |
| if args.purpose == "prompt": |
| prefix = args.prefix or "prompts_lite" |
| train_path, validation_path, test_path = export_prompt_dataset(dataset_name=args.dataset, prefix=prefix) |
| print(f"Wrote {train_path}") |
| print(f"Wrote {validation_path}") |
| if test_path: |
| print(f"Wrote {test_path}") |
| raise SystemExit(0) |
| prefix = args.prefix or config.dataset_prefix("lite") |
| train_path, validation_path, test_path = download_hub_dataset(dataset_name=args.dataset, prefix=prefix) |
| print(f"Wrote {train_path}") |
| print(f"Wrote {validation_path}") |
| print(f"Wrote {test_path}") |
| raise SystemExit(0) |
| train_path, test_path = curate_lite( |
| category=args.category, |
| train_size=args.train_size, |
| test_size=args.test_size, |
| ) |
| print(f"Wrote {train_path}") |
| print(f"Wrote {test_path}") |
|
|