# Source: DealSight-Intelligence / scripts / 01_curate_lite.py
# Commit 76562dd (abhinavvathadi): Initial commit for DealSight Intelligence
"""CLI entry point for dataset preparation.

Three modes:

- default: build a fresh lite dataset from raw Amazon metadata.
- `--from-source --purpose structured`: download a structured `items_*`
  dataset from Hugging Face (or a local folder) and write the splits.
- `--from-source --purpose prompt`: do the same for a prompt/completion
  fine-tuning dataset.

`--from-hub` is accepted as an alias for `--from-source`.
"""
import sys
from pathlib import Path
import argparse
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
from dealsight_intelligence import config
from dealsight_intelligence.data.curate_lite import curate_lite, download_hub_dataset, export_prompt_dataset
def _build_parser():
    """Build the argument parser shared by every dataset-preparation mode."""
    parser = argparse.ArgumentParser(description="Create or download datasets for DealSight Intelligence.")
    parser.add_argument(
        "--from-hub",
        "--from-source",
        action="store_true",
        dest="from_source",
        help="Load a Hugging Face dataset repo ID or local exported dataset folder",
    )
    parser.add_argument(
        "--purpose",
        choices=["structured", "prompt"],
        default="structured",
        help="structured is for app/vectorstore/RF; prompt is for prompt training/eval only",
    )
    parser.add_argument(
        "--dataset",
        default=None,
        help="HF dataset repo ID or local dataset folder. Defaults to dealsight_intelligence_*_DATASET_SOURCE.",
    )
    parser.add_argument("--prefix", default=None, help="Output filename prefix, for example lite, full, prompts_lite")
    parser.add_argument("--category", default="Appliances", help="Amazon metadata category to load")
    parser.add_argument("--train-size", type=int, default=20000, help="Number of training items")
    parser.add_argument("--test-size", type=int, default=2000, help="Number of test items")
    return parser


def _report_written(*paths):
    """Print one ``Wrote <path>`` line for each given output path."""
    for path in paths:
        print(f"Wrote {path}")


def main(argv=None):
    """Run the dataset-preparation CLI and return the process exit code.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` once the selected mode has finished and its output paths have
        been printed.

    Raises:
        SystemExit: Raised by argparse for ``--help`` (code 0) or for invalid
            arguments (code 2).
    """
    args = _build_parser().parse_args(argv)

    if not args.from_source:
        # The default mode only forwards category and sizes to curate_lite, so
        # tell the user instead of silently dropping the other options.
        if args.dataset is not None or args.prefix is not None:
            print(
                "warning: --dataset and --prefix are only used with --from-source; ignoring them",
                file=sys.stderr,
            )
        train_path, test_path = curate_lite(
            category=args.category,
            train_size=args.train_size,
            test_size=args.test_size,
        )
        _report_written(train_path, test_path)
        return 0

    if args.purpose == "prompt":
        prefix = args.prefix or "prompts_lite"
        train_path, validation_path, test_path = export_prompt_dataset(dataset_name=args.dataset, prefix=prefix)
        _report_written(train_path, validation_path)
        # Only report the test split when the exporter returned a truthy path.
        if test_path:
            _report_written(test_path)
        return 0

    # Structured download: fall back to the configured prefix when none is given.
    prefix = args.prefix or config.dataset_prefix("lite")
    train_path, validation_path, test_path = download_hub_dataset(dataset_name=args.dataset, prefix=prefix)
    _report_written(train_path, validation_path, test_path)
    return 0


if __name__ == "__main__":
    sys.exit(main())