File size: 5,365 Bytes
b0c0df0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import datasets
from PIL import Image
# BibTeX entry for the ChartQA paper (Findings of ACL 2022); passed to
# `datasets.DatasetInfo(citation=...)` in `ChartQA._info`.
_CITATION = """\
@inproceedings{masry-etal-2022-chartqa,
title = "{C}hart{QA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning",
author = "Masry, Ahmed and
Long, Do and
Tan, Jia Qing and
Joty, Shafiq and
Hoque, Enamul",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.177",
doi = "10.18653/v1/2022.findings-acl.177",
pages = "2263--2279",
}
"""
# One-sentence dataset summary; passed to
# `datasets.DatasetInfo(description=...)` in `ChartQA._info`.
_DESCRIPTION = "A largescale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries."
def get_builder_config(VERSION):
    """Build the list of builder configurations exposed by this script.

    Args:
        VERSION: Version object forwarded to the single ``BuilderConfig``.

    Returns:
        A one-element list containing the ``"ChartQA"``
        ``datasets.BuilderConfig``.
    """
    return [
        datasets.BuilderConfig(
            # Plain literals: the original f-strings had no placeholders.
            name="ChartQA",
            version=VERSION,
            description="ChartQA",
        )
    ]
# Column schema of every example; wrapped in `datasets.Features` by
# `ChartQA._info` and filled in by `ChartQA._generate_examples`.
dataset_features = {
"type": datasets.Value("string"),  # "human_test" or "augmented_test"
"question": datasets.Value("string"),  # taken from the annotation's "query" field
"answer": datasets.Value("string"),  # taken from the annotation's "label" field
"image": datasets.Image(),  # chart image located via the annotation's "imgname" field
}
class ChartQA(datasets.GeneratorBasedBuilder):
    """Dataset builder that packages the ChartQA test split from local files.

    The human-written and the augmented annotation files are merged into a
    single ``test`` split; the ``type`` column records which file an example
    came from.
    """

    VERSION = datasets.Version("1.0.0")
    BUILDER_CONFIGS = get_builder_config(VERSION)

    # Folder holding `test_human.json`, `test_augmented.json` and `png/`.
    # Used when the caller does not pass `data_dir` to `load_dataset`.
    _DEFAULT_TEST_DIR = "/home/yhzhang/ChartQA/ChartQA Dataset/test"

    def _info(self):
        """Return the dataset metadata: description, column schema, citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(dataset_features),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Declare the single ``test`` split, read from a local directory.

        The directory is ``self.config.data_dir`` when the caller supplied
        one, otherwise ``_DEFAULT_TEST_DIR``.

        Args:
            dl_manager: Download manager supplied by ``datasets``; unused
                because nothing is downloaded.

        Returns:
            A one-element list with the ``SplitGenerator`` of the test split.
        """
        test_dir = self.config.data_dir or self._DEFAULT_TEST_DIR
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "human_annotation": os.path.join(test_dir, "test_human.json"),
                    "augmented_annotation": os.path.join(test_dir, "test_augmented.json"),
                    # The trailing "" keeps the directory's trailing separator,
                    # so the default value matches the previously hard-coded path.
                    "images": os.path.join(test_dir, "png", ""),
                },
            ),
        ]

    # Method parameters are unpacked from `gen_kwargs` as given in `_split_generators`.
    def _generate_examples(self, human_annotation, augmented_annotation, images):
        """Yield ``(key, example)`` pairs for both annotation files.

        Args:
            human_annotation: Path to the human-written annotation JSON file.
            augmented_annotation: Path to the augmented annotation JSON file.
            images: Directory containing the images named by each row's
                ``imgname`` field.

        Yields:
            ``(index, example)`` tuples. ``index`` is a running counter across
            both files, so keys stay unique. ``example`` holds the ``type``,
            ``image``, ``question`` and ``answer`` fields.
        """
        # Load both files up front so a missing or malformed file fails
        # before any example is produced.
        sources = []
        for example_type, annotation_path in (
            ("human_test", human_annotation),
            ("augmented_test", augmented_annotation),
        ):
            with open(annotation_path, encoding="utf-8") as f:
                sources.append((example_type, json.load(f)))

        index = 0
        # Each payload carries its own label, which avoids comparing whole
        # annotation dicts for every row to find out where a row came from.
        for example_type, data in sources:
            for row in data["data"]:
                yield index, {
                    "type": example_type,
                    "image": Image.open(os.path.join(images, row["imgname"])),
                    "question": row["query"],
                    "answer": row["label"],
                }
                index += 1
if __name__ == "__main__":
    # Build the dataset with the loading script, then upload the result to
    # the Hub as a private repository.
    from datasets import load_dataset

    script_path = "/home/yhzhang/lmms-eval/lmms_eval/tasks/chartqa/upload_chartqa.py"
    built = load_dataset(script_path)
    built.push_to_hub("lmms-lab/chartqa", private=True)
|