Spaces:

abhinavvathadi
/

DealSight-Intelligence

Running

App Files Files Community

DealSight-Intelligence / scripts /01_curate_lite.py

abhinavvathadi

Initial commit for DealSight Intelligence

76562dd about 1 month ago

raw

history blame contribute delete

2.79 kB

	"""CLI entry point for dataset preparation.

	Three modes:
	- default: build a fresh lite dataset from raw Amazon metadata.
	- `--from-source --purpose structured`: download a structured `items_*`
	dataset from Hugging Face (or a local folder) and write the splits.
	- `--from-source --purpose prompt`: do the same for a prompt/completion
	fine-tuning dataset.
	"""

	import sys
	from pathlib import Path
	import argparse

	# Put the sibling `src/` directory (two levels up from this file, then
	# `src`) at the front of the import path so the `dealsight_intelligence`
	# imports below resolve when this script is run directly. This must stay
	# ahead of those imports.
	sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))

	from dealsight_intelligence import config
	from dealsight_intelligence.data.curate_lite import curate_lite, download_hub_dataset, export_prompt_dataset


	def build_parser() -> argparse.ArgumentParser:
	    """Build the command-line parser for the dataset preparation script.

	    Returns:
	        A parser exposing ``--from-hub``/``--from-source`` (stored as
	        ``from_source``), ``--purpose``, ``--dataset``, ``--prefix``,
	        ``--category``, ``--train-size`` and ``--test-size``.
	    """
	    parser = argparse.ArgumentParser(description="Create or download datasets for DealSight Intelligence.")
	    parser.add_argument(
	        "--from-hub",
	        "--from-source",
	        action="store_true",
	        dest="from_source",
	        help="Load a Hugging Face dataset repo ID or local exported dataset folder",
	    )
	    parser.add_argument(
	        "--purpose",
	        choices=["structured", "prompt"],
	        default="structured",
	        help="structured is for app/vectorstore/RF; prompt is for prompt training/eval only",
	    )
	    parser.add_argument(
	        "--dataset",
	        default=None,
	        help="HF dataset repo ID or local dataset folder. Defaults to dealsight_intelligence_*_DATASET_SOURCE.",
	    )
	    parser.add_argument("--prefix", default=None, help="Output filename prefix, for example lite, full, prompts_lite")
	    parser.add_argument("--category", default="Appliances", help="Amazon metadata category to load")
	    parser.add_argument("--train-size", type=int, default=20000, help="Number of training items")
	    parser.add_argument("--test-size", type=int, default=2000, help="Number of test items")
	    return parser


	def _report_written(*paths, optional=None) -> None:
	    """Print one ``Wrote <path>`` line per path.

	    Args:
	        *paths: Paths that are always reported, in order.
	        optional: A trailing path that is reported only when it is truthy.
	    """
	    for path in paths:
	        print(f"Wrote {path}")
	    if optional:
	        print(f"Wrote {optional}")


	def main(argv=None) -> int:
	    """Run the dataset preparation CLI.

	    Without ``--from-source`` this calls ``curate_lite`` with the category and
	    split sizes. With ``--from-source`` it calls ``export_prompt_dataset``
	    (``--purpose prompt``) or ``download_hub_dataset`` (``--purpose
	    structured``) with the dataset name and output prefix.

	    Args:
	        argv: Argument list to parse; ``None`` makes argparse read
	            ``sys.argv[1:]``.

	    Returns:
	        The process exit status, ``0`` on success.
	    """
	    args = build_parser().parse_args(argv)

	    if not args.from_source:
	        # NOTE(review): --purpose, --dataset and --prefix are not consulted in
	        # this mode; confirm that is intended.
	        train_path, test_path = curate_lite(
	            category=args.category,
	            train_size=args.train_size,
	            test_size=args.test_size,
	        )
	        _report_written(train_path, test_path)
	        return 0

	    # NOTE(review): --category, --train-size and --test-size are not consulted
	    # when loading from a source.
	    if args.purpose == "prompt":
	        prefix = args.prefix or "prompts_lite"
	        load_splits = export_prompt_dataset
	    else:
	        # The config default is only looked up when no --prefix was given.
	        prefix = args.prefix or config.dataset_prefix("lite")
	        load_splits = download_hub_dataset

	    train_path, validation_path, test_path = load_splits(dataset_name=args.dataset, prefix=prefix)
	    # The test split is treated as optional in both modes so a missing one is
	    # skipped instead of being printed as "Wrote None".
	    _report_written(train_path, validation_path, optional=test_path)
	    return 0


	if __name__ == "__main__":
	    raise SystemExit(main())