| import json |
| import datasets |
|
|
| |
| _DESCRIPTION = """ |
| TanaData is a custom dataset for instruction-response tasks. |
| """ |
|
|
| _CITATION = """ |
| @misc{tanadata2025, |
| title={TanaData Dataset}, |
| year={2025}, |
| note={Custom dataset hosted on Hugging Face} |
| } |
| """ |
|
|
class TanaData(datasets.GeneratorBasedBuilder):
    """Dataset builder for TanaData instruction/input/output records.

    Downloads a single JSON file and exposes its records as the ``train``
    split. Every example is a dict of three string columns.
    """

    VERSION = datasets.Version("1.0.0")

    # Column names declared in ``_info``; ``_generate_examples`` yields dicts
    # with exactly these keys so they always match the declared features.
    _FIELDS = ("instruction", "input", "output")

    def _info(self):
        """Return the dataset metadata (description, features, homepage, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {field: datasets.Value("string") for field in self._FIELDS}
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/mdevoz/tanadata",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the JSON data file and declare the single ``train`` split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; used to
                fetch the remote file and return its local path.

        Returns:
            A one-element list holding the ``train`` split generator, whose
            ``gen_kwargs`` pass the local file path to ``_generate_examples``.
        """
        file_path = dl_manager.download_and_extract(
            "https://huggingface.co/mdevoz/tanadata/resolve/main/tanadata.json"
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from the downloaded JSON file.

        Args:
            filepath: Local path of the JSON file. Its top-level value must be
                a list of objects.

        Yields:
            Tuples of the record's position in the list and a dict holding
            exactly the declared columns. A column missing from a record is
            filled with an empty string; keys outside the declared columns
            are dropped.

        Raises:
            ValueError: If the top-level JSON value is not a list, or if a
                record is not a JSON object.
        """
        # Parse everything up front and close the file before yielding, so the
        # handle is not held open for as long as the consumer keeps iterating.
        with open(filepath, encoding="utf-8") as f:
            records = json.load(f)

        if not isinstance(records, list):
            raise ValueError(
                f"Expected a JSON list of records in {filepath!r}, "
                f"got {type(records).__name__}"
            )

        for idx, record in enumerate(records):
            if not isinstance(record, dict):
                raise ValueError(
                    f"Record {idx} in {filepath!r} is not a JSON object "
                    f"(got {type(record).__name__})"
                )
            # Project onto the declared columns so a missing or extra key in
            # the data cannot cause a schema mismatch during encoding.
            yield idx, {field: record.get(field, "") for field in self._FIELDS}
|
|
| |
| |
| |
| |
| |
|
|