"""Hugging Face `datasets` loading script for the TanaData dataset."""

import json

import datasets

# You can update these with more detailed information.
_DESCRIPTION = """
TanaData is a custom dataset for instruction-response tasks.
"""

_CITATION = """
@misc{tanadata2025,
  title={TanaData Dataset},
  year={2025},
  note={Custom dataset hosted on Hugging Face}
}
"""


class TanaData(datasets.GeneratorBasedBuilder):
    """Dataset builder exposing a single TRAIN split of instruction records."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata and its three string-valued features."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "instruction": datasets.Value("string"),
                    "input": datasets.Value("string"),
                    "output": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/mdevoz/tanadata",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the JSON file and declare the TRAIN split.

        Args:
            dl_manager: Download manager supplied by `datasets`; only its
                `download_and_extract` method is used here.

        Returns:
            A one-element list with the TRAIN `SplitGenerator`, whose
            `gen_kwargs` pass the downloaded path on as `filepath`.
        """
        # This URL points to your JSON file in the repository.
        file_path = dl_manager.download_and_extract(
            "https://huggingface.co/mdevoz/tanadata/resolve/main/tanadata.json"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield `(index, example)` pairs read from the JSON file.

        Args:
            filepath: Path of the UTF-8 encoded JSON file to read.

        Yields:
            Tuples of the running index and the corresponding element of the
            parsed JSON document, unmodified.
        """
        # Adjust this logic based on your JSON file structure.
        with open(filepath, encoding="utf-8") as f:
            # If your file is a JSON array of examples:
            data = json.load(f)

        # The whole document is already in memory, so iterate after the file
        # is closed instead of holding the handle open across yields.
        # NOTE(review): each element is presumably a dict with "instruction",
        # "input" and "output" keys matching `_info` - confirm against the data.
        for idx, example in enumerate(data):
            yield idx, example


# For testing, you can uncomment the following lines locally:
# if __name__ == "__main__":
#     from datasets import load_dataset
#     dataset = load_dataset(__file__, name="tanadata")
#     print(dataset)