| """ |
| scripts/additional-datasets/lrv_instruct.py |
| |
| Standalone script for pre-processing the LRV-Instruct data (including the chart/diagram reasoning split). This isn't |
| full conversational chat data, but rather each example has an input prompt and output response; we'll use this structure |
| to format the data equivalently to the LLaVa-v1.5 dataset. |
| |
| In general, LRV Instruct provides *both positive and negative* examples -- where a negative example is a question or |
| instruction that is *not answerable* or *irrelevant*; the goal of this dataset is to reduce hallucinations in VLMs. |
| |
| This script processes the raw instruct data (three different JSON files), as well as the image files, all of which must |
| be downloaded manually beforehand (see the steps below); the non-chart images come from Visual Genome, but are hosted |
| separately by the LRV Instruct authors and use different image IDs, so we're downloading this data (again) for |
| simplicity. The chart images come from the LRV Instruct authors, and are sourced from statista.com. All file URLs are |
| here: https://github.com/FuxiaoLiu/LRV-Instruction/blob/main/download.txt#L20 |
| |
| Note that we are using the *coordinate-free* data (due to noted inaccuracies in the original coordinates). |
| |
| Make sure to download the images first to `data/download/llava-v1.5-instruct/lrv` |
| => cd data/download/llava-v1.5-instruct/lrv |
| => [Visual Genome] gdown https://drive.google.com/uc?id=1k9MNV-ImEV9BYEOeLEIb4uGEUZjd3QbM |
| => `tar -xvf image.tar.gz; mv image lrv-vg; rm image.tar.gz` |
| => [Chart Data] gdown https://drive.google.com/uc?id=1Dey-undzW2Nl21CYLFSkP_Y4RrfRJkYd |
| => `unzip chart_image.zip; rm -rf __MACOSX; mv chart_image lrv-chart; rm chart_image.zip` |
| |
| Download the raw JSON files to the same directory - `data/download/llava-v1.5-instruct/lrv` |
| => [LRV Instruct Pt. 1] gdown https://drive.google.com/uc?id=1pWkxE2kqpys1VdwBi99ZXN6-XY5SqhwU |
| => `filter_cap1.json` |
| => [LRV Instruct Pt. II] gdown https://drive.google.com/uc?id=1NTxkuRPlvDn7aWaJpK_yb0p5r0cxPLNZ |
| => `filter_cap_more1.json` |
| => [Chart Instruct] gdown https://drive.google.com/uc?id=13j2U-ectsYGR92r6J5hPdhT8T5ezItHF |
| => `chart_release_update.json` |
| |
| References: "Mitigating Hallucination in Large Multi-Modal Models via Robust Instruction Tuning" |
| => Paper: https://arxiv.org/abs/2306.14565 |
| => Github / Data: https://github.com/FuxiaoLiu/LRV-Instruction |
| """ |
|
|
| import json |
| import random |
| from pathlib import Path |
|
|
| from tqdm import tqdm |
|
|
| |
# Root of the LLaVa v1.5 instruct download tree, and the LRV-Instruct folder inside of it
BASE_DIR = Path("data/download/llava-v1.5-instruct")
LRV_DIR = BASE_DIR / "lrv"

# Visual Genome split: two instruction JSON files that share a single image directory
VG_JSON_FILES = [LRV_DIR / name for name in ("filter_cap1.json", "filter_cap_more1.json")]
VG_IMG_DIR = LRV_DIR / "lrv-vg"

# Chart split: one instruction JSON file and its own image directory
CHART_JSON_FILE = LRV_DIR / "chart_release_update.json"
CHART_IMG_DIR = LRV_DIR / "lrv-chart"

# Existing instruct mixtures that the LRV examples get merged into (inputs)
BASE_JSON_FILE = BASE_DIR / "llava_v1_5_mix665k.json"
BASE_LVIS_JSON_FILE = BASE_DIR / "llava_v1_5_lvis4v_mix888k.json"

# Merged mixtures written by this script (outputs)
MERGED_BASE_LRV_JSON_FILE = BASE_DIR / "llava_v1_5_lrv_mix1008k.json"
MERGED_BASE_LVIS_LRV_JSON_FILE = BASE_DIR / "llava_v1_5_lvis4v_lrv_mix1231k.json"
|
|
|
|
def _load_json(path: Path) -> list:
    """Parse and return the contents of the JSON file at `path`, decoded as UTF-8."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _require_image(image_file: Path, display_name: str) -> None:
    """Raise `FileNotFoundError` if `image_file` does not exist on disk."""
    # An explicit raise (rather than `assert`) keeps the check active under `python -O`
    if not image_file.exists():
        raise FileNotFoundError(f"Missing Image `{display_name}`")


def _to_llava_example(example_id, image: str, question: str, answer: str) -> dict:
    """Build one single-turn (human prompt, gpt response) example in the LLaVa conversation format."""
    return {
        "id": example_id,
        "image": image,
        "conversations": [
            {"from": "human", "value": f"<image>\n{question.strip()}"},
            {"from": "gpt", "value": answer.strip()},
        ],
    }


def _merge_shuffle_write(base_data: list, lrv_data: list, out_file: Path) -> None:
    """Concatenate `base_data` and `lrv_data`, shuffle the result, and dump it as JSON to `out_file`."""
    merged = base_data + lrv_data

    # Three consecutive passes are kept on purpose: the number of shuffles determines the final order for a given
    # seed, so changing it would produce a differently ordered file than earlier runs of this script.
    for _ in range(3):
        random.shuffle(merged)

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(merged, f)


def build_lrv_instruct() -> None:
    """
    Convert the LRV-Instruct data to the LLaVa conversation format and merge it into the existing instruct mixtures.

    Reads the Visual Genome and chart instruction JSON files, verifies that every referenced image exists on disk,
    rewrites each (question, answer) pair as a single-turn conversation, and writes two shuffled mixtures:
    `MERGED_BASE_LRV_JSON_FILE` (from `BASE_JSON_FILE`) and `MERGED_BASE_LVIS_LRV_JSON_FILE` (from
    `BASE_LVIS_JSON_FILE`).

    Raises:
        FileNotFoundError: If an image referenced by an example is missing, or if an input JSON file cannot be found.
    """
    print("[*] Downloading and Formatting `LRV-Instruct` Dataset!")

    # Fixed seed so that the shuffled output files are reproducible across runs
    random.seed(7)

    # Load both Visual Genome instruction files into one flat list of examples
    vg_examples = []
    for fn in VG_JSON_FILES:
        vg_examples.extend(_load_json(fn))

    # Verify images up front, so that nothing is written if the download is incomplete
    for example in tqdm(vg_examples, desc="[*] Verifying all VG Images in LRV Instruct"):
        image_id = example["image_id"]
        _require_image(VG_IMG_DIR / f"{image_id}.jpg", f"{image_id}.jpg")

    chart_examples = _load_json(CHART_JSON_FILE)

    # For chart examples, `image_id` is used directly as the file name (no extension is appended)
    for example in tqdm(chart_examples, desc="[*] Verifying all Chart Images in LRV Instruct"):
        image_path = example["image_id"]
        _require_image(CHART_IMG_DIR / image_path, image_path)

    # Image paths are written relative to `BASE_DIR` (i.e., prefixed with `lrv/...`)
    vg_chat_json = [
        _to_llava_example(
            vg_example["image_id"],
            f"lrv/lrv-vg/{vg_example['image_id']}.jpg",
            vg_example["question"],
            vg_example["answer"],
        )
        for vg_example in tqdm(vg_examples, desc="[*] Converting all VG Examples to LLaVa Format")
    ]

    chart_chat_json = [
        _to_llava_example(
            Path(chart_example["image_id"]).stem,
            f"lrv/lrv-chart/{chart_example['image_id']}",
            chart_example["question"],
            chart_example["answer"],
        )
        for chart_example in tqdm(chart_examples, desc="[*] Converting all Chart Examples to LLaVa Format")
    ]

    lrv_data = vg_chat_json + chart_chat_json

    # Mixture 1 :: LLaVa v1.5 + LRV-Instruct
    print("[*] Loading LLaVa v1.5 Data!")
    _merge_shuffle_write(_load_json(BASE_JSON_FILE), lrv_data, MERGED_BASE_LRV_JSON_FILE)

    # Mixture 2 :: LLaVa v1.5 + LVIS-4V + LRV-Instruct
    print("[*] Loading LLaVa v1.5 + LVIS-4V Instruct Data!")
    _merge_shuffle_write(_load_json(BASE_LVIS_JSON_FILE), lrv_data, MERGED_BASE_LVIS_LRV_JSON_FILE)
|
|
|
|
# Script entry point: run the full LRV-Instruct pre-processing only when this file is executed directly (not on import)
if __name__ == "__main__":
    build_lrv_instruct()
|
|