Upload folder using huggingface_hub

e88bb91 verified 2 months ago

6.68 kB

	"""
	scripts/additional-datasets/lrv_instruct.py

	Standalone script for pre-processing the LRV-Instruct data (including the chart/diagram reasoning split). This isn't
	full conversational chat data, but rather each example has an input prompt and output response; we'll use this structure
	to format the data equivalently to the LLaVa-v1.5 dataset.

	In general, LRV Instruct provides both positive and negative examples -- where a negative example is a question or
	instruction that is not answerable or irrelevant; the goal of this dataset is to reduce hallucinations in VLMs.

	This script downloads the raw instruct data (three different JSON files), as well as the image files; the non-chart
	images come from Visual Genome, but are hosted separately by the LRV Instruct authors and use different image IDs, so
	we're downloading this data (again) for simplicity. The chart images come from the LRV Instruct authors, and are sourced
	from statista.com. All file URLs are here: https://github.com/FuxiaoLiu/LRV-Instruction/blob/main/download.txt#L20

	Note that we are using the coordinate-free data (due to noted inaccuracies in the original coordinates).

	Make sure to download the images first to `data/download/llava-v1.5-instruct/lrv`
	=> cd data/download/llava-v1.5-instruct/lrv
	=> [Visual Genome] gdown https://drive.google.com/uc?id=1k9MNV-ImEV9BYEOeLEIb4uGEUZjd3QbM
	=> `tar -xvf image.tar.gz; mv image lrv-vg; rm image.tar.gz`
	=> [Chart Data] gdown https://drive.google.com/uc?id=1Dey-undzW2Nl21CYLFSkP_Y4RrfRJkYd
	=> `unzip chart_image.zip; rm -rf __MACOSX; mv chart_image lrv-chart; rm chart_image.zip`

	Download the raw JSON files to the same directory - `data/download/llava-v1.5-instruct/lrv`
	=> [LRV Instruct Pt. 1] gdown https://drive.google.com/uc?id=1pWkxE2kqpys1VdwBi99ZXN6-XY5SqhwU
	=> `filter_cap1.json`
	=> [LRV Instruct Pt. II] gdown https://drive.google.com/uc?id=1NTxkuRPlvDn7aWaJpK_yb0p5r0cxPLNZ
	=> `filter_cap_more1.json`
	=> [Chart Instruct] gdown https://drive.google.com/uc?id=13j2U-ectsYGR92r6J5hPdhT8T5ezItHF
	=> `chart_release_update.json`

	References: "Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning"
	=> Paper: https://arxiv.org/abs/2306.14565
	=> Github / Data: https://github.com/FuxiaoLiu/LRV-Instruction
	"""

	import json
	import random
	from pathlib import Path

	from tqdm import tqdm

	# === Constants ===
	# Root of the LLaVa v1.5 instruct download tree; every LRV asset lives in its `lrv` subdirectory.
	BASE_DIR = Path("data/download/llava-v1.5-instruct")
	LRV_DIR = BASE_DIR.joinpath("lrv")

	# Visual Genome split => two instruction JSON files + the extracted image directory
	VG_JSON_FILES = [LRV_DIR.joinpath(name) for name in ("filter_cap1.json", "filter_cap_more1.json")]
	VG_IMG_DIR = LRV_DIR.joinpath("lrv-vg")

	# Chart split => a single instruction JSON file + the extracted image directory
	CHART_JSON_FILE = LRV_DIR.joinpath("chart_release_update.json")
	CHART_IMG_DIR = LRV_DIR.joinpath("lrv-chart")

	# Input JSON files that the LRV data gets merged into (`llava_v1_5_mix665k.json` and
	# `llava_v1_5_lvis4v_mix888k.json`)
	BASE_JSON_FILE = BASE_DIR.joinpath("llava_v1_5_mix665k.json")
	BASE_LVIS_JSON_FILE = BASE_DIR.joinpath("llava_v1_5_lvis4v_mix888k.json")

	# Output JSON files for the "merged" variants of the dataset (one per input file above)
	MERGED_BASE_LRV_JSON_FILE = BASE_DIR.joinpath("llava_v1_5_lrv_mix1008k.json")
	MERGED_BASE_LVIS_LRV_JSON_FILE = BASE_DIR.joinpath("llava_v1_5_lvis4v_lrv_mix1231k.json")


	def _read_json(json_path: Path) -> list:
	    """Parse `json_path` as JSON, always decoding it as UTF-8 instead of the platform's locale encoding.

	    Every caller in this script iterates or concatenates the parsed document as a list of example dictionaries.
	    """
	    with open(json_path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def _require_image(image_file: Path, image_name: str) -> None:
	    """Raise `FileNotFoundError` if `image_file` is not on disk; `image_name` is the name shown in the message."""
	    # An explicit raise (rather than `assert`) keeps the check active when Python runs with `-O`.
	    if not image_file.exists():
	        raise FileNotFoundError(f"Missing Image `{image_name}`")


	def _to_llava_chat(example_id, image: str, question: str, answer: str) -> dict:
	    """Build one LLaVa "chat" style entry: a single human turn (image + question) and a single gpt turn."""
	    return {
	        "id": example_id,
	        "image": image,
	        "conversations": [
	            {"from": "human", "value": f"<image>\n{question.strip()}"},
	            {"from": "gpt", "value": answer.strip()},
	        ],
	    }


	def _merge_shuffle_write(base_json: Path, lrv_data: list, merged_json: Path) -> None:
	    """Append `lrv_data` to the examples stored in `base_json`, shuffle, and dump the result to `merged_json`."""
	    merged = _read_json(base_json) + lrv_data

	    # Three consecutive shuffles are kept on purpose: together with the fixed seed they determine the output
	    # order, so collapsing them into one call would change the files this script writes.
	    for _ in range(3):
	        random.shuffle(merged)

	    with open(merged_json, "w", encoding="utf-8") as f:
	        json.dump(merged, f)


	def build_lrv_instruct() -> None:
	    """Convert the LRV-Instruct data to LLaVa "chat" format and write the two merged training JSON files.

	    Reads the VG and Chart instruction files (`VG_JSON_FILES`, `CHART_JSON_FILE`), checks that every referenced
	    image exists under `VG_IMG_DIR` / `CHART_IMG_DIR`, reformats each example as a two-turn conversation, and
	    then appends the result to `BASE_JSON_FILE` and `BASE_LVIS_JSON_FILE`, writing the shuffled mixtures to
	    `MERGED_BASE_LRV_JSON_FILE` and `MERGED_BASE_LVIS_LRV_JSON_FILE` respectively.

	    Nothing is downloaded here; all input files must already be on disk.

	    Raises:
	        FileNotFoundError: If an image referenced by an example is missing, or if an input JSON file is absent.
	    """
	    print("[*] Downloading and Formatting `LRV-Instruct` Dataset!")

	    # Set Random Seed =>> makes the shuffled output order reproducible
	    random.seed(7)

	    # Open VG JSON Files
	    vg_examples = []
	    for fn in VG_JSON_FILES:
	        vg_examples.extend(_read_json(fn))

	    # Iterate through VG Examples & Verify Image Existence
	    for example in tqdm(vg_examples, desc="[*] Verifying all VG Images in LRV Instruct"):
	        image_id = example["image_id"]
	        _require_image(VG_IMG_DIR / f"{image_id}.jpg", f"{image_id}.jpg")

	    # Open Chart JSON File
	    chart_examples = _read_json(CHART_JSON_FILE)

	    # Iterate through Chart Examples & Verify Image Existence (chart `image_id` values are used as file names)
	    for example in tqdm(chart_examples, desc="[*] Verifying all Chart Images in LRV Instruct"):
	        image_path = example["image_id"]
	        _require_image(CHART_IMG_DIR / image_path, image_path)

	    # Reformat VG Examples as LLaVa "Chat" Style => List[Entry] where each Entry is a Dictionary:
	    #   => "id": str
	    #   => "image": str -- Relative path from `BASE_DIR`
	    #   => "conversations": List[Turn] where Turn is a Dictionary:
	    #       => {"from": "human", "value": "<image>\n{VG_EXAMPLE['question']}"}
	    #       => {"from": "gpt", "value": "{VG_EXAMPLE['answer']}"}
	    vg_chat_json = [
	        _to_llava_chat(
	            vg_example["image_id"],
	            f"lrv/lrv-vg/{vg_example['image_id']}.jpg",
	            vg_example["question"],
	            vg_example["answer"],
	        )
	        for vg_example in tqdm(vg_examples, desc="[*] Converting all VG Examples to LLaVa Format")
	    ]

	    # Reformat Chart Examples as LLaVa "Chat" Style (the entry id is the image file name without its extension)
	    chart_chat_json = [
	        _to_llava_chat(
	            Path(chart_example["image_id"]).stem,
	            f"lrv/lrv-chart/{chart_example['image_id']}",
	            chart_example["question"],
	            chart_example["answer"],
	        )
	        for chart_example in tqdm(chart_examples, desc="[*] Converting all Chart Examples to LLaVa Format")
	    ]

	    # Merge and Create Full LRV Chat Data =>> Total of 342,799 Examples (per the original note; not re-checked here)
	    lrv_data = vg_chat_json + chart_chat_json

	    # Create Stacked Datasets =>> Shuffle for Good Measure!
	    # NOTE: the order of the two calls matters -- both consume the same seeded random stream.
	    print("[*] Loading LLaVa v1.5 Data!")
	    _merge_shuffle_write(BASE_JSON_FILE, lrv_data, MERGED_BASE_LRV_JSON_FILE)

	    print("[*] Loading LLaVa v1.5 + LVIS-4V Instruct Data!")
	    _merge_shuffle_write(BASE_LVIS_JSON_FILE, lrv_data, MERGED_BASE_LVIS_LRV_JSON_FILE)


	# Script entry point =>> build the merged datasets only when this file is executed directly (not on import)
	if __name__ == "__main__":
	    build_lrv_instruct()