"""Filter the LLaVA v1.5 mix665k annotation file down to entries with a readable image.

Every entry of the input JSON list that has an ``'image'`` key is resolved to
a file on disk; the entry is kept only when that file exists and OpenCV can
decode it. Entries without an ``'image'`` key are not kept. The surviving
entries are written to a new JSON file.
"""
import json
import os

import cv2
from tqdm import tqdm

# Directory against which non-COCO relative image paths are resolved.
image_root = '/path/to/llava/image/root'
# Directory that holds the COCO images as a flat list of files.
coco_root = '/path/to/coco/train2017'

# Input annotation list and destination for the filtered list.
annotation_file = '/path/to/llava/image/root/llava_v1_5_mix665k.json'
output_file = '/home/hk/yyma/data/mm_data_zem/LLaVA-Instruct-150K/llava_v1_5_mix665k_onlyMM_filtered.json'

filter_list = []

# Decode explicitly as UTF-8 so the result does not depend on the
# platform's locale default encoding.
with open(annotation_file, encoding='utf-8') as f:
    data = json.load(f)

for sample in tqdm(data):
    # Entries without an image reference are dropped from the output
    # (presumably text-only conversations; the output name says "onlyMM").
    if 'image' not in sample:
        continue

    image_path = sample['image']
    if 'coco' in image_path:
        # COCO images are looked up by bare file name directly under
        # coco_root, discarding any directory prefix in the annotation.
        file_name = os.path.join(coco_root, os.path.basename(image_path))
    else:
        file_name = os.path.join(image_root, image_path)

    if not os.path.exists(file_name):
        print(f'cant find {file_name}')
        continue

    # A None result from cv2.imread is treated as "file exists but cannot
    # be opened as an image"; the decoded pixels themselves are not needed.
    if cv2.imread(file_name) is None:
        print(f'cant open {file_name}')
        continue

    filter_list.append(sample)

print(f'after filter, data length is {len(filter_list)}')

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(filter_list, f)