mdevoz committed on
Commit
9b36a83
·
verified ·
1 Parent(s): ad1860e

Upload tanadata.py

Browse files
Files changed (1) hide show
  1. tanadata.py +57 -0
tanadata.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import datasets
3
+
4
+ # You can update these with more detailed information.
5
+ _DESCRIPTION = """
6
+ TanaData is a custom dataset for instruction-response tasks.
7
+ """
8
+
9
+ _CITATION = """
10
+ @misc{tanadata2025,
11
+ title={TanaData Dataset},
12
+ year={2025},
13
+ note={Custom dataset hosted on Hugging Face}
14
+ }
15
+ """
16
+
17
class TanaData(datasets.GeneratorBasedBuilder):
    """Dataset builder for the TanaData instruction/input/output records.

    Downloads a single JSON file and exposes its contents as one ``train``
    split whose examples carry the string columns listed in ``_COLUMNS``.
    """

    VERSION = datasets.Version("1.0.0")

    # Location of the JSON file that holds the examples.
    _DATA_URL = "https://huggingface.co/mdevoz/tanadata/resolve/main/tana_z.json"

    # Columns declared in ``_info``; ``_generate_examples`` emits exactly
    # these keys so the two can never drift apart.
    _COLUMNS = ("instruction", "input", "output")

    def _info(self):
        """Return the dataset metadata (description, features, homepage, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {column: datasets.Value("string") for column in self._COLUMNS}
            ),
            supervised_keys=None,
            homepage="https://huggingface.co/mdevoz/tanadata",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the JSON file and describe the single ``train`` split.

        Args:
            dl_manager: download manager supplied by ``datasets``; its
                ``download_and_extract`` method is used to fetch the file.

        Returns:
            A one-element list with the ``train`` split generator, whose
            ``gen_kwargs`` pass the downloaded path on as ``filepath``.
        """
        file_path = dl_manager.download_and_extract(self._DATA_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": file_path},
            )
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from a JSON array of objects.

        Each record is projected onto ``_COLUMNS``: keys that are not
        declared features are dropped and missing fields become ``None``,
        so every yielded example matches the declared schema.

        Args:
            filepath: path to a UTF-8 encoded JSON file.

        Yields:
            Tuples of the record's position in the array and a dict with
            exactly the keys in ``_COLUMNS``.

        Raises:
            ValueError: if the top-level JSON value is not an array, or if
                a record inside the array is not a JSON object.
        """
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        # Iterating a JSON object would silently yield its keys as
        # "examples", so reject anything that is not an array up front.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array of examples in {filepath!r}, "
                f"got {type(data).__name__}"
            )

        for idx, example in enumerate(data):
            if not isinstance(example, dict):
                raise ValueError(
                    f"Example {idx} in {filepath!r} is not a JSON object "
                    f"(got {type(example).__name__})"
                )
            yield idx, {column: example.get(column) for column in self._COLUMNS}
52
+
53
+ # For testing, you can uncomment the following lines locally:
54
+ # if __name__ == "__main__":
55
+ # from datasets import load_dataset
56
+ # dataset = load_dataset(__file__, name="tanadata")
57
+ # print(dataset)