|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
import os |
|
|
|
|
|
import datasets |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
# BibTeX entry for the ChartQA paper (Masry et al., Findings of ACL 2022).
# Passed as the ``citation`` field of the ``DatasetInfo`` built in ``_info``.
_CITATION = """\
@inproceedings{masry-etal-2022-chartqa,
title = "{C}hart{QA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning",
author = "Masry, Ahmed and
Long, Do and
Tan, Jia Qing and
Joty, Shafiq and
Hoque, Enamul",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2022.findings-acl.177",
doi = "10.18653/v1/2022.findings-acl.177",
pages = "2263--2279",
}
"""
|
|
# One-sentence summary of the benchmark; passed as the ``description`` field
# of the ``DatasetInfo`` built in ``_info``.
_DESCRIPTION = "A largescale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries."
|
|
|
|
|
|
|
|
def get_builder_config(VERSION):
    """Build the builder configurations offered by this loading script.

    Args:
        VERSION: Version value forwarded unchanged to
            ``datasets.BuilderConfig(version=...)``.

    Returns:
        A one-element list holding the configuration named ``"ChartQA"``.
    """
    chartqa_config = datasets.BuilderConfig(
        name="ChartQA",
        version=VERSION,
        description="ChartQA",
    )
    return [chartqa_config]
|
|
|
|
|
|
|
|
# Column schema of every example: three string columns ("type", "question",
# "answer") followed by the chart picture stored as an image feature.
dataset_features = {
    **{column: datasets.Value("string") for column in ("type", "question", "answer")},
    "image": datasets.Image(),
}
|
|
|
|
|
|
|
|
class ChartQA(datasets.GeneratorBasedBuilder):
    """Dataset builder for the ChartQA test split.

    Reads the human-written and the augmented test annotations plus the chart
    images from a local copy of the ChartQA release and yields one example
    per question with the columns declared in ``dataset_features``.
    """

    VERSION = datasets.Version("1.0.0")

    BUILDER_CONFIGS = get_builder_config(VERSION)

    # Local ChartQA checkout used when ``load_dataset`` is called without
    # ``data_dir``.
    _DEFAULT_DATA_ROOT = "/home/yhzhang/ChartQA/ChartQA Dataset"

    def _info(self):
        """Return the dataset metadata: description, feature schema, citation."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(dataset_features),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Declare the single TEST split and the local files it is built from.

        Nothing is downloaded. The files are looked up under
        ``dl_manager.manual_dir`` (the ``data_dir`` given to ``load_dataset``)
        when it is set, otherwise under ``_DEFAULT_DATA_ROOT``.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``manual_dir`` attribute is consulted.

        Returns:
            A one-element list with the ``datasets.Split.TEST`` generator
            whose ``gen_kwargs`` match the parameters of
            ``_generate_examples``.
        """
        data_root = dl_manager.manual_dir or self._DEFAULT_DATA_ROOT
        test_dir = os.path.join(data_root, "test")
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "human_annotation": os.path.join(test_dir, "test_human.json"),
                    "augmented_annotation": os.path.join(test_dir, "test_augmented.json"),
                    # The empty last component keeps the trailing separator
                    # of the image directory path.
                    "images": os.path.join(test_dir, "png", ""),
                },
            ),
        ]

    def _generate_examples(self, human_annotation, augmented_annotation, images):
        """Yield ``(key, example)`` pairs for every test question.

        Args:
            human_annotation: Path to the JSON file with human-written
                questions.
            augmented_annotation: Path to the JSON file with augmented
                questions.
            images: Directory holding the chart images referenced by each
                row's ``"imgname"``.

        Yields:
            Tuples ``(index, example)``. ``index`` counts up from 0 across
            both files, human rows first. ``example`` has the keys
            ``"type"``, ``"image"``, ``"question"`` and ``"answer"``.
        """
        sources = (
            ("human_test", human_annotation),
            ("augmented_test", augmented_annotation),
        )

        # Load both files before yielding anything so that a missing or
        # malformed file fails the build immediately.
        labelled_rows = []
        for question_type, annotation_path in sources:
            with open(annotation_path, encoding="utf-8") as f:
                labelled_rows.append((question_type, json.load(f)["data"]))

        # The label travels with the rows of its file. Deriving it by
        # comparing the loaded JSON contents would mislabel augmented rows
        # whenever both files happen to hold equal data.
        all_rows = (
            (question_type, row)
            for question_type, rows in labelled_rows
            for row in rows
        )
        for index, (question_type, row) in enumerate(all_rows):
            yield index, {
                "type": question_type,
                "image": Image.open(os.path.join(images, row["imgname"])),
                "question": row["query"],
                "answer": row["label"],
            }
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    from datasets import load_dataset

    # Build the dataset by running the loading script at this path, then
    # upload the result to the Hub as a private repository.
    chartqa = load_dataset("/home/yhzhang/lmms-eval/lmms_eval/tasks/chartqa/upload_chartqa.py")
    chartqa.push_to_hub("lmms-lab/chartqa", private=True)
|
|
|