mdevoz
/

tanadata

Model card Files Files and versions

tanadata / tanadata.py

mdevoz's picture

Update tanadata.py

416fa7f verified 12 months ago

1.82 kB

	import json
	import datasets

	# Human-readable summary of the dataset; passed to DatasetInfo(description=...).
	# Replace it with more detailed information as the dataset evolves.
	_DESCRIPTION = """
	TanaData is a custom dataset for instruction-response tasks.
	"""

	# BibTeX entry for citing the dataset; passed to DatasetInfo(citation=...).
	_CITATION = """
	@misc{tanadata2025,
	title={TanaData Dataset},
	year={2025},
	note={Custom dataset hosted on Hugging Face}
	}
	"""

	class TanaData(datasets.GeneratorBasedBuilder):
	    """Dataset builder for TanaData instruction/response records.

	    Downloads one JSON file from the Hub repository and exposes its records
	    as a single ``train`` split with three string columns: ``instruction``,
	    ``input`` and ``output``.
	    """

	    VERSION = datasets.Version("1.0.0")

	    # Single source of truth for the column names, shared by the schema in
	    # ``_info`` and the record projection in ``_generate_examples``.
	    _COLUMNS = ("instruction", "input", "output")

	    def _info(self):
	        """Return the dataset metadata and the declared column schema."""
	        return datasets.DatasetInfo(
	            description=_DESCRIPTION,
	            features=datasets.Features(
	                {column: datasets.Value("string") for column in self._COLUMNS}
	            ),
	            supervised_keys=None,
	            homepage="https://huggingface.co/mdevoz/tanadata",
	            citation=_CITATION,
	        )

	    def _split_generators(self, dl_manager):
	        """Download the JSON data file and declare the single train split.

	        Args:
	            dl_manager: Download manager supplied by the ``datasets`` library.

	        Returns:
	            A one-element list holding the ``train`` split generator, whose
	            ``filepath`` keyword is forwarded to ``_generate_examples``.
	        """
	        # This URL points to the JSON file stored in the repository.
	        file_path = dl_manager.download_and_extract(
	            "https://huggingface.co/mdevoz/tanadata/resolve/main/tanadata.json"
	        )
	        return [
	            datasets.SplitGenerator(
	                name=datasets.Split.TRAIN,
	                gen_kwargs={"filepath": file_path},
	            )
	        ]

	    def _generate_examples(self, filepath):
	        """Yield ``(index, example)`` pairs from a JSON array file.

	        Args:
	            filepath: Local path of the downloaded JSON file.

	        Yields:
	            Tuples of the record's position in the array and a dict holding
	            exactly the declared columns. A column missing from a record is
	            emitted as ``None``; keys outside the schema are dropped.

	        Raises:
	            ValueError: If the top-level JSON value is not an array, or a
	                record is not a JSON object.
	        """
	        # Load everything first so the file handle is closed before the
	        # generator starts yielding (it may be suspended indefinitely).
	        with open(filepath, encoding="utf-8") as f:
	            data = json.load(f)

	        if not isinstance(data, list):
	            raise ValueError(
	                f"Expected a JSON array of examples in {filepath}, "
	                f"got {type(data).__name__}"
	            )

	        for idx, example in enumerate(data):
	            if not isinstance(example, dict):
	                raise ValueError(
	                    f"Example {idx} in {filepath} is not a JSON object "
	                    f"(got {type(example).__name__})"
	                )
	            # Project onto the declared schema so records with a missing
	            # or extra key behave the same across ``datasets`` versions.
	            yield idx, {column: example.get(column) for column in self._COLUMNS}

	# For testing, you can uncomment the following lines locally:
	# if __name__ == "__main__":
	# from datasets import load_dataset
	# dataset = load_dataset(__file__, name="tanadata")
	# print(dataset)