| | import os |
| | import shutil |
| | import json |
| | import random |
| |
|
| | |
# Pool of interchangeable instructions asking for a short image description.
# One entry is drawn at random for each generated conversation record.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated by Python into a single (run-on) prompt.
prompt_for_image = [
    "Describe the image concisely.",
    "Provide a brief description of the given image.",
    "Offer a succinct explanation of the picture presented.",
    "Summarize the visual content of the image.",
    "Give a short and clear explanation of the subsequent image.",
    "Share a concise interpretation of the image provided.",
    "Present a compact description of the photo's key features.",
    "Relay a brief, clear account of the picture shown.",
    "Render a clear and concise summary of the photo.",
    "Write a terse but informative summary of the picture.",
    "Create a compact narrative representing the image presented.",
]
| |
|
| | |
# Folder holding the input images, named "<index>.jpg".
source_folder = "/mnt/petrelfs/zhuchenglin/diffusion/images_large1"

# Root folder that receives the copied images (in numbered subfolders).
target_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain/images"

# Folder that receives the generated annotation JSON file.
target_anno_folder = "/mnt/petrelfs/zhuchenglin/LLaVA/playground/data/LLaVA-Pretrain"

# Caption annotation file; its "annotations" list supplies one caption per record.
annotations_coco_path = (
    "/mnt/petrelfs/zhuchenglin/diffusion/coco/annotations/captions_train2017.json"
)
# JSON text is UTF-8; state it explicitly so decoding does not depend on the
# locale's default encoding of the machine running the script.
with open(annotations_coco_path, "r", encoding="utf-8") as f:
    annotations = json.load(f)
| |
|
# Copy each available source image into its numbered target subfolder and
# build one conversation record (random prompt + caption) per copied image.
new_annotations = []
missing_images = 0
for index, annotation in enumerate(annotations["annotations"][:500000]):
    print(index)

    # Images are grouped 10,000 per subfolder; subfolder numbering starts at 00900.
    folder_index = 900 + (index // 10000)
    target_subfolder = f"{folder_index:05d}"

    # Record id = subfolder number followed by the 4-digit position inside it;
    # the image file name is that id plus ".jpg".
    image_id = f"{target_subfolder}{index % 10000:04d}"
    target_image_name = f"{image_id}.jpg"

    source_image_path = os.path.join(source_folder, f"{index}.jpg")
    if not os.path.exists(source_image_path):
        # No image exists for this caption. Skip the record entirely so the
        # annotation file never references an image that was not copied.
        missing_images += 1
        continue

    # exist_ok=True avoids the separate existence check (and its race).
    target_subfolder_path = os.path.join(target_folder, target_subfolder)
    os.makedirs(target_subfolder_path, exist_ok=True)
    target_image_path = os.path.join(target_subfolder_path, target_image_name)
    shutil.copy(source_image_path, target_image_path)

    random_prompt = random.choice(prompt_for_image)
    new_annotation = {
        "id": image_id,
        "image": f"{target_subfolder}/{target_image_name}",
        "conversations": [
            {"from": "human", "value": f"{random_prompt}\n<image>"},
            {"from": "gpt", "value": annotation["caption"]},
        ],
    }
    new_annotations.append(new_annotation)

if missing_images:
    print(f"Skipped {missing_images} captions with no source image.")
| |
|
# Serialize every collected record into a single 4-space-indented JSON file
# inside the annotation folder.
json_file_path = os.path.join(target_anno_folder, "coco_annotations_500k.json")
with open(json_file_path, mode="w") as json_file:
    json_file.write(json.dumps(new_annotations, indent=4))
| |
|