zooblastlbz
/

id-align

Model card Files Files and versions

id-align / scripts /archived /quick_check.py

zooblastlbz's picture

Upload folder using huggingface_hub

a9e1e1a verified 7 months ago

history blame contribute delete

2 kB

	import json
	import os
	import argparse
	from tqdm import tqdm
	import yaml


	def check_missing_images(json_path, images_folder):
	    """Return the dataset entries whose referenced image file does not exist.

	    The JSON file at ``json_path`` is loaded and its entries are iterated
	    under a tqdm progress bar. Entries without an ``"image"`` key, or whose
	    ``"image"`` value is the empty string, are skipped. For every other
	    entry the image value is joined onto ``images_folder``; when the
	    resulting path does not exist a ``Missing image: ...`` line is printed
	    and the entry is collected.

	    Args:
	        json_path: Path to the JSON file to load.
	        images_folder: Directory that the ``"image"`` values are resolved
	            against with ``os.path.join``.

	    Returns:
	        A list, in iteration order, of the entries whose image path is
	        missing on disk. Empty when nothing is missing.
	    """
	    # Context manager so the handle is closed as soon as parsing finishes
	    # instead of lingering until garbage collection.
	    with open(json_path, "r") as f:
	        data = json.load(f)

	    missing_data = []
	    for d in tqdm(data):
	        image = d["image"] if "image" in d else ""
	        if image == "":
	            # Entry has no image reference; nothing to verify.
	            continue
	        path = os.path.join(images_folder, image)
	        if not os.path.exists(path):
	            print(f"Missing image: {path}")
	            missing_data.append(d)

	    return missing_data


	def read_yaml_to_llava_data(yaml_path, images_folder):
	    """Check every JSON dataset listed in a YAML file for missing images.

	    The YAML file is parsed with ``yaml.safe_load``. Its top-level
	    ``"datasets"`` value is iterated, and the ``"json_path"`` of each item
	    is checked against ``images_folder``. Results are printed; nothing is
	    returned.

	    Args:
	        yaml_path: Path to the YAML file to read.
	        images_folder: Directory that image references are resolved against.
	    """
	    print(f"Reading YAML file: {yaml_path}")
	    with open(yaml_path, "r") as f:
	        data = yaml.safe_load(f)

	    for item in data["datasets"]:
	        # Reuse the single-file checker rather than repeating its
	        # check-and-report logic here; the printed output is the same.
	        direct_check_llava_data(item["json_path"], images_folder)


	def direct_check_llava_data(json_path, images_folder):
	    """Report the entries of one JSON dataset whose image file is missing.

	    Runs ``check_missing_images`` on ``json_path``. When it returns any
	    entries, prints a ``Missing images in <json_path>:`` header followed by
	    each entry on its own line. Prints no summary when nothing is missing.

	    Args:
	        json_path: Path to the JSON dataset file to check.
	        images_folder: Directory that image references are resolved against.
	    """
	    absent = check_missing_images(json_path, images_folder)
	    if not absent:
	        return

	    print(f"Missing images in {json_path}:")
	    for entry in absent:
	        print(entry)


	if __name__ == "__main__":
	    # Command-line entry point: check a single JSON dataset, or every
	    # dataset listed in a YAML file, for image files missing on disk.
	    cli = argparse.ArgumentParser(description="Check for missing images in dataset.")
	    cli.add_argument(
	        "--yaml_path",
	        type=str,
	        default="",
	        help="Path to the YAML file containing the dataset.",
	    )
	    cli.add_argument(
	        "--json_path",
	        type=str,
	        default="",
	        help="Path to the JSON file containing the dataset.",
	    )
	    cli.add_argument(
	        "--images_folder",
	        type=str,
	        default="/mnt/bn/vl-research/data/llava_data",
	        help="Path to the folder containing the images.",
	    )

	    opts = cli.parse_args()

	    # --json_path wins when both are given; --yaml_path is consulted only
	    # when no JSON path was supplied. With neither, nothing is checked.
	    if opts.json_path:
	        direct_check_llava_data(opts.json_path, opts.images_folder)
	    elif opts.yaml_path:
	        read_yaml_to_llava_data(opts.yaml_path, opts.images_folder)