Update tanadata.py
Browse files- tanadata.py +57 -57
tanadata.py
CHANGED
|
@@ -1,57 +1,57 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import datasets
|
| 3 |
-
|
| 4 |
-
# You can update these with more detailed information.
|
| 5 |
-
_DESCRIPTION = """
|
| 6 |
-
TanaData is a custom dataset for instruction-response tasks.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
_CITATION = """
|
| 10 |
-
@misc{tanadata2025,
|
| 11 |
-
title={TanaData Dataset},
|
| 12 |
-
year={2025},
|
| 13 |
-
note={Custom dataset hosted on Hugging Face}
|
| 14 |
-
}
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
class TanaData(datasets.GeneratorBasedBuilder):
|
| 18 |
-
VERSION = datasets.Version("1.0.0")
|
| 19 |
-
|
| 20 |
-
def _info(self):
|
| 21 |
-
return datasets.DatasetInfo(
|
| 22 |
-
description=_DESCRIPTION,
|
| 23 |
-
features=datasets.Features({
|
| 24 |
-
"instruction": datasets.Value("string"),
|
| 25 |
-
"input": datasets.Value("string"),
|
| 26 |
-
"output": datasets.Value("string"),
|
| 27 |
-
}),
|
| 28 |
-
supervised_keys=None,
|
| 29 |
-
homepage="https://huggingface.co/mdevoz/tanadata",
|
| 30 |
-
citation=_CITATION,
|
| 31 |
-
)
|
| 32 |
-
|
| 33 |
-
def _split_generators(self, dl_manager):
|
| 34 |
-
# This URL points to your JSON file in the repository.
|
| 35 |
-
file_path = dl_manager.download_and_extract(
|
| 36 |
-
"https://huggingface.co/mdevoz/tanadata/resolve/main/
|
| 37 |
-
)
|
| 38 |
-
return [
|
| 39 |
-
datasets.SplitGenerator(
|
| 40 |
-
name=datasets.Split.TRAIN,
|
| 41 |
-
gen_kwargs={"filepath": file_path}
|
| 42 |
-
)
|
| 43 |
-
]
|
| 44 |
-
|
| 45 |
-
def _generate_examples(self, filepath):
|
| 46 |
-
# Adjust this logic based on your JSON file structure.
|
| 47 |
-
with open(filepath, encoding="utf-8") as f:
|
| 48 |
-
# If your file is a JSON array of examples:
|
| 49 |
-
data = json.load(f)
|
| 50 |
-
for idx, example in enumerate(data):
|
| 51 |
-
yield idx, example
|
| 52 |
-
|
| 53 |
-
# For testing, you can uncomment the following lines locally:
|
| 54 |
-
# if __name__ == "__main__":
|
| 55 |
-
# from datasets import load_dataset
|
| 56 |
-
# dataset = load_dataset(__file__, name="tanadata")
|
| 57 |
-
# print(dataset)
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import datasets
|
| 3 |
+
|
| 4 |
+
# You can update these with more detailed information.
|
| 5 |
+
_DESCRIPTION = """
|
| 6 |
+
TanaData is a custom dataset for instruction-response tasks.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
_CITATION = """
|
| 10 |
+
@misc{tanadata2025,
|
| 11 |
+
title={TanaData Dataset},
|
| 12 |
+
year={2025},
|
| 13 |
+
note={Custom dataset hosted on Hugging Face}
|
| 14 |
+
}
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
class TanaData(datasets.GeneratorBasedBuilder):
    """Dataset builder for the TanaData instruction/response data.

    Downloads a single JSON file and exposes it as one ``train`` split whose
    rows carry three string columns: ``instruction``, ``input`` and ``output``.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata: description, schema, homepage, citation."""
        columns = datasets.Features(
            {
                "instruction": datasets.Value("string"),
                "input": datasets.Value("string"),
                "output": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=columns,
            supervised_keys=None,
            homepage="https://huggingface.co/mdevoz/tanadata",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Fetch the JSON data file and declare the single train split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used here.

        Returns:
            A one-element list holding the TRAIN ``SplitGenerator``. Its
            ``gen_kwargs`` key matches the ``filepath`` parameter of
            ``_generate_examples``.
        """
        # This URL points to your JSON file in the repository.
        local_json = dl_manager.download_and_extract(
            "https://huggingface.co/mdevoz/tanadata/resolve/main/tanadata.json"
        )
        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": local_json},
        )
        return [train_split]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs read from the JSON file.

        Args:
            filepath: Path of the local JSON file to read (opened as UTF-8).

        Yields:
            Tuples of the running index and the corresponding parsed item.
        """
        # Adjust this logic based on your JSON file structure.
        # The whole document is parsed at once and then iterated, so the
        # file is expected to hold a JSON array of examples.
        with open(filepath, encoding="utf-8") as handle:
            records = json.load(handle)
        yield from enumerate(records)
|
| 52 |
+
|
| 53 |
+
# For testing, you can uncomment the following lines locally:
|
| 54 |
+
# if __name__ == "__main__":
|
| 55 |
+
# from datasets import load_dataset
|
| 56 |
+
# dataset = load_dataset(__file__, name="tanadata")
|
| 57 |
+
# print(dataset)
|