number_reading / - /number_reading.py
Huayang
Upload folder using huggingface_hub
313e0d2 verified
import os
import datasets
# Citation text handed to ``datasets.DatasetInfo`` (still a placeholder).
_CITATION = "# (Optional) Add your citation here\n"

# Short description handed to ``datasets.DatasetInfo``.
_DESCRIPTION = "Number Reading\n"

# One builder config is generated per entry; the entry is also used as the
# stem of the ``<language>.jsonl`` data file that the builder reads.
LANGUAGES = ["english"]
class NumberReadingConfig(datasets.BuilderConfig):
    """Builder configuration identified by a task name.

    The task name is forwarded to ``datasets.BuilderConfig`` as the config
    ``name`` and is additionally stored on the instance as ``task_name``.
    """

    def __init__(self, task_name, **kwargs):
        """Create a config whose ``name`` equals ``task_name``.

        Args:
            task_name: Identifier used as the config name.
            **kwargs: Forwarded unchanged to ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs, name=task_name)
        self.task_name = task_name
class NumberReading(datasets.GeneratorBasedBuilder):
    """Dataset builder exposing a single test split per language.

    One config is created for every entry of ``LANGUAGES``. For the selected
    config the builder reads ``<config name>.jsonl`` and yields one example
    per non-blank line.
    """

    BUILDER_CONFIGS = [
        NumberReadingConfig(task_name=lang) for lang in LANGUAGES
    ]

    # Directory the script originally read from unconditionally. It is kept as
    # a fallback so the author's existing local workflow keeps working, but it
    # is no longer the only place the data can come from.
    _LEGACY_DATA_DIR = "/home/huayang_sakana_ai/workspace/FSAugmentation/hf_data"

    def _info(self):
        """Return the dataset metadata and the feature schema of each example."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "golden_reading": datasets.Sequence(datasets.Value("string")),
                "prompt": datasets.Value("string"),
                "number": datasets.Value("string"),
                "language": datasets.Value("string"),
                # add more fields depending on your JSONL schema
            }),
            supervised_keys=None,
            # The previous value contained literal backticks and pointed at a
            # different dataset ("nlgraph").
            # NOTE(review): repository slug assumed to be "number_reading" --
            # confirm against the actual hub repository.
            homepage="https://huggingface.co/datasets/huayangli/number_reading",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Declare the single TEST split backed by ``<config name>.jsonl``.

        Args:
            dl_manager: Download manager supplied by ``datasets``; used only
                when the file has to be resolved relative to this script.

        Returns:
            A one-element list with the TEST ``datasets.SplitGenerator``.
        """
        filename = f"{self.config.name}.jsonl"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": self._resolve_data_file(dl_manager, filename),
                },
            )
        ]

    def _resolve_data_file(self, dl_manager, filename):
        """Locate ``filename``, preferring explicit configuration.

        Resolution order:
          1. ``self.config.data_dir`` when the caller provided one.
          2. The legacy hard-coded directory, if the file exists there.
          3. A path relative to this loading script, resolved through
             ``dl_manager.download``.
        """
        data_dir = self.config.data_dir
        if data_dir:
            return os.path.join(data_dir, filename)

        legacy_path = os.path.join(self._LEGACY_DATA_DIR, filename)
        if os.path.isfile(legacy_path):
            return legacy_path

        # NOTE(review): assumes the JSONL files sit next to this script in the
        # repository -- confirm the repository layout.
        return dl_manager.download(filename)

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from a JSONL file.

        Args:
            filepath: Path of the UTF-8 encoded JSONL file to read.

        Yields:
            Tuples of the zero-based line index and the decoded record with
            its ``"training"`` entry removed.
        """
        import json

        with open(filepath, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing inside json.loads.
                if not line.strip():
                    continue
                data = json.loads(line)
                # "training" is not part of the schema declared in _info, so
                # drop it; records that lack the key are accepted as-is.
                data.pop("training", None)
                yield idx, data