import argparse
import json
import os

import yaml
from tqdm import tqdm
def check_missing_images(json_path, images_folder):
    """Return the dataset entries whose referenced image file is missing.

    Loads the JSON document at ``json_path`` and iterates over it. For each
    entry with a non-empty ``"image"`` value, that value is joined onto
    ``images_folder`` and checked with ``os.path.exists``. Every missing
    path is printed as soon as it is detected.

    Args:
        json_path: Path to the JSON dataset file.
        images_folder: Directory the entries' ``"image"`` values are
            resolved against.

    Returns:
        A list of the entries (the same objects loaded from the file, in
        file order) whose image path does not exist.
    """
    # Context manager so the file handle is closed deterministically rather
    # than left open until garbage collection.
    with open(json_path, "r") as f:
        data = json.load(f)

    missing_data = []
    for d in tqdm(data):
        # Entries are assumed to be dicts; one without an "image" key is
        # treated the same as one with an empty image name and skipped.
        image = d.get("image", "")
        if image == "":
            continue
        path = os.path.join(images_folder, image)
        if not os.path.exists(path):
            print(f"Missing image: {path}")
            missing_data.append(d)
    return missing_data
def read_yaml_to_llava_data(yaml_path, images_folder):
    """Check every JSON dataset listed in a YAML file for missing images.

    The YAML document is parsed with ``yaml.safe_load``. Its top-level
    ``"datasets"`` value is iterated, and each item's ``"json_path"`` is
    checked against ``images_folder``. For each dataset with missing images
    a header naming the JSON file is printed, followed by the offending
    entries.

    Args:
        yaml_path: Path to the YAML file describing the datasets.
        images_folder: Directory the datasets' image names are resolved
            against.

    Raises:
        KeyError: If the parsed mapping has no ``"datasets"`` key, or an
            item has no ``"json_path"`` key.
    """
    print(f"Reading YAML file: {yaml_path}")
    with open(yaml_path, "r") as f:
        data = yaml.safe_load(f)

    for item in data["datasets"]:
        # Delegate to the single-file checker so both entry points share one
        # reporting routine instead of duplicating the print loop.
        direct_check_llava_data(item["json_path"], images_folder)
def direct_check_llava_data(json_path, images_folder):
    """Report the entries of one JSON dataset whose images are missing.

    Runs ``check_missing_images`` on ``json_path``. When it returns any
    entries, prints a header naming the JSON file and then each entry on its
    own line. Prints nothing extra when no entry is returned.

    Args:
        json_path: Path to the JSON dataset file.
        images_folder: Directory the entries' image names are resolved
            against.
    """
    entries_without_image = check_missing_images(json_path, images_folder)
    if not entries_without_image:
        # Nothing missing: no summary to print.
        return

    print(f"Missing images in {json_path}:")
    for entry in entries_without_image:
        print(entry)
if __name__ == "__main__":
    # Command-line entry point: check a single JSON dataset (--json_path) or
    # every dataset listed in a YAML file (--yaml_path) for missing images.
    parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
    parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
    parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")
    args = parser.parse_args()

    # --json_path takes precedence when both options are supplied.
    if args.json_path != "":
        direct_check_llava_data(args.json_path, args.images_folder)
    elif args.yaml_path != "":
        read_yaml_to_llava_data(args.yaml_path, args.images_folder)
    else:
        # Without either input there is nothing to check; fail loudly with a
        # usage message instead of exiting silently as if the check passed.
        parser.error("one of --json_path or --yaml_path must be provided")