llm_cp2 / src /lmms-eval /lmms_eval /tasks /chartqa /upload_chartqa.py

Upload folder using huggingface_hub

b0c0df0 verified about 1 month ago

5.37 kB

	# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	import json
	import os

	import datasets
	from PIL import Image

	# Find for instance the citation on arxiv or on the dataset repo/website
	_CITATION = """\
	@inproceedings{masry-etal-2022-chartqa,
	title = "{C}hart{QA}: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning",
	author = "Masry, Ahmed and
	Long, Do and
	Tan, Jia Qing and
	Joty, Shafiq and
	Hoque, Enamul",
	booktitle = "Findings of the Association for Computational Linguistics: ACL 2022",
	month = may,
	year = "2022",
	address = "Dublin, Ireland",
	publisher = "Association for Computational Linguistics",
	url = "https://aclanthology.org/2022.findings-acl.177",
	doi = "10.18653/v1/2022.findings-acl.177",
	pages = "2263--2279",
	}
	"""
	_DESCRIPTION = "A largescale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries."


	def get_builder_config(VERSION):
	builder_config = [
	datasets.BuilderConfig(
	name=f"ChartQA",
	version=VERSION,
	description=f"ChartQA",
	)
	]
	return builder_config


	dataset_features = {
	"type": datasets.Value("string"),
	"question": datasets.Value("string"),
	"answer": datasets.Value("string"),
	"image": datasets.Image(),
	}


	class ChartQA(datasets.GeneratorBasedBuilder):
	VERSION = datasets.Version("1.0.0")

	BUILDER_CONFIGS = get_builder_config(VERSION)

	def _info(self):
	features = datasets.Features(dataset_features)
	return datasets.DatasetInfo(
	# This is the description that will appear on the datasets page.
	description=_DESCRIPTION,
	# This defines the different columns of the dataset and their types
	features=features, # Here we define them above because they are different between the two configurations
	# If there's a common (input, target) tuple from the features, uncomment supervised_keys line below and
	# specify them. They'll be used if as_supervised=True in builder.as_dataset.
	# supervised_keys=("sentence", "label"),
	# Homepage of the dataset for documentation
	# Citation for the dataset
	citation=_CITATION,
	)

	def _split_generators(self, dl_manager):
	# If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name

	# dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLS
	# It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
	# By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
	image_path = "/home/yhzhang/ChartQA/ChartQA Dataset/test/png/"
	human_annotation_path = "/home/yhzhang/ChartQA/ChartQA Dataset/test/test_human.json"
	augmented_annotation_path = "/home/yhzhang/ChartQA/ChartQA Dataset/test/test_augmented.json"
	return [
	datasets.SplitGenerator(
	name=datasets.Split.TEST,
	gen_kwargs={
	"human_annotation": human_annotation_path,
	"augmented_annotation": augmented_annotation_path,
	"images": image_path,
	},
	),
	]

	# method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
	def _generate_examples(self, human_annotation, augmented_annotation, images):
	# The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example.
	with open(human_annotation, encoding="utf-8") as f:
	human_data = json.load(f)
	with open(augmented_annotation, encoding="utf-8") as f:
	augmented_data = json.load(f)
	index = -1
	for data in [human_data, augmented_data]:
	for row in data["data"]:
	index += 1
	image_path = os.path.join(images, row["imgname"])
	now_data = {}
	now_data["type"] = "human_test" if data == human_data else "augmented_test"
	now_data["image"] = Image.open(image_path)
	now_data["question"] = row["query"]
	now_data["answer"] = row["label"]
	yield index, now_data


	if __name__ == "__main__":
	from datasets import load_dataset

	data = load_dataset(
	"/home/yhzhang/lmms-eval/lmms_eval/tasks/chartqa/upload_chartqa.py",
	)
	data.push_to_hub("lmms-lab/chartqa", private=True)