File size: 1,061 Bytes
625a17f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
import cv2
import os
from tqdm import tqdm

# Filter the LLaVA v1.5 mix665k annotation list down to the entries that
# reference an image which exists on disk and can be decoded by OpenCV,
# then write the surviving entries to a new JSON file.

# Directory against which non-COCO image paths are resolved.
image_root = '/path/to/llava/image/root'
# Directory in which COCO images are looked up by bare file name.
coco_root = '/path/to/coco/train2017'

# Entries that pass every check, in their original order.
filter_list = []

with open('/path/to/llava/image/root/llava_v1_5_mix665k.json') as f:
    data = json.load(f)

for sample in tqdm(data):
    # Entries without an 'image' key are dropped without any message.
    if 'image' not in sample:
        continue

    rel_path = sample['image']
    if 'coco' in rel_path:
        # Any path containing 'coco' is reduced to its file name and
        # resolved directly under coco_root, ignoring its sub-directories.
        candidate = os.path.join(coco_root, os.path.basename(rel_path))
    else:
        candidate = os.path.join(image_root, rel_path)

    if not os.path.exists(candidate):
        print(f'cant find {candidate}')
        continue

    # cv2.imread yields None for a file it cannot decode; such entries are
    # reported and skipped.
    if cv2.imread(candidate) is None:
        print(f'cant open {candidate}')
        continue

    filter_list.append(sample)

print(f'after filter, data length is {len(filter_list)}')

with open('/home/hk/yyma/data/mm_data_zem/LLaVA-Instruct-150K/llava_v1_5_mix665k_onlyMM_filtered.json', 'w') as f:
    json.dump(filter_list, f)