File size: 4,634 Bytes
a9e1e1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import os
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


def load_data(json_path):
    """Read the JSON file at ``json_path`` and return its parsed content.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (the rest of this script
        iterates over it as a sequence of dict items).
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # encoding, which would mis-decode non-ASCII text on some systems.
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


def filter_data(data):
    """Return, in their original order, the items of ``data`` that contain an ``"image"`` key.

    Only the presence of the key is checked; its value is not inspected.
    """
    return list(filter(lambda entry: "image" in entry, data))


from multiprocessing import Pool
import functools


def calculate_image_dimension(item, images_folder):
    """Return the ``(width, height)`` of the image referenced by ``item``.

    The image path is ``images_folder`` joined with ``item["image"]``.

    Returns:
        A ``(width, height)`` tuple taken from the opened image's ``size``,
        or ``(None, None)`` if anything goes wrong while opening or reading
        it (the error is printed rather than raised).
    """
    target = os.path.join(images_folder, item["image"])
    try:
        with Image.open(target) as picture:
            w, h = picture.size
    except Exception as err:
        # Best effort: report the failure and let the caller skip this item.
        print(f"Error opening {target}: {err}")
        return None, None
    else:
        return w, h


def calculate_image_dimensions_multiprocess(filtered_data, images_folder, num_processes=256):
    """Collect image widths and heights for ``filtered_data`` using a process pool.

    Each item is passed to ``calculate_image_dimension`` together with
    ``images_folder``. Items for which that function returned ``None`` as the
    width (i.e. it could not read the image) are left out of the result.

    Args:
        filtered_data: Sized sequence of items, each with an ``"image"`` key.
        images_folder: Folder that the items' image paths are relative to.
        num_processes: Upper bound on the number of worker processes.

    Returns:
        Two lists ``(widths, heights)`` of equal length. Both are empty when
        there is no input or when no image could be read.
    """
    if not filtered_data:
        return [], []

    worker = functools.partial(calculate_image_dimension, images_folder=images_folder)
    # Never start more workers than there are items to process.
    pool_size = max(1, min(num_processes, len(filtered_data)))
    with Pool(pool_size) as p:
        dimensions = list(
            tqdm(
                p.imap(worker, filtered_data),
                total=len(filtered_data),
                desc="Calculating image dimensions",
            )
        )

    readable = [dim for dim in dimensions if dim[0] is not None]
    if not readable:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return [], []
    widths, heights = zip(*readable)
    return list(widths), list(heights)


def tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting pieces."""
    tokens = text.split()
    return tokens


def calculate_tokenized_lengths(data):
    """Return the token count of every conversation turn in ``data``.

    For each item, every entry of ``item["conversations"]`` has its
    ``"value"`` tokenized with ``tokenize``; the counts are returned as one
    flat list in iteration order. Progress over the items is shown with tqdm.
    """
    return [
        len(tokenize(turn["value"]))
        for entry in tqdm(data, desc="Tokenizing conversations")
        for turn in entry["conversations"]
    ]


import argparse


def _histogram_bin_edges(values, step=100):
    """Return bin edges of width ``step`` covering ``min(values)`` to ``max(values)``.

    When every value is identical, a single unit-wide bin is returned.
    """
    lowest, highest = min(values), max(values)
    if lowest == highest:
        return [lowest, highest + 1]
    return np.arange(lowest, highest + step, step)


def main():
    """Plot image-size and conversation-length distributions for a dataset JSON.

    Command-line arguments:
        --json_path: JSON file to analyse (required).
        --images_folder: Folder that the items' ``"image"`` paths are relative to.
        --output_dir: Directory in which ``dist_<name>_combined.png`` is written,
            where ``<name>`` is the JSON file name without ``.json``.

    The left subplot is a 2D histogram of image widths and heights for items
    that have an ``"image"`` key; the right subplot is a log-scale histogram of
    whitespace-token counts over every conversation turn. A subplot is left
    empty when there is no data for it.
    """
    parser = argparse.ArgumentParser(description="Process data for LLaVA_Next project.")
    # Without a JSON path there is nothing to do, so let argparse report it.
    parser.add_argument("--json_path", type=str, required=True, help="Path to the JSON file containing data.")
    parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing images.")
    parser.add_argument(
        "--output_dir",
        type=str,
        default="/mnt/bn/vl-research/workspace/boli01/projects/LLaVA_Next/notebooks/sft_data",
        help="Directory in which the combined plot is saved.",
    )
    args = parser.parse_args()

    json_path = args.json_path
    llava_instruct_name = os.path.basename(json_path).replace(".json", "")
    images_folder = args.images_folder

    data = load_data(json_path)
    filtered_data = filter_data(data)

    widths, heights = [], []
    if filtered_data:
        print(f"Total data items: {len(data)}, Filtered data items: {len(filtered_data)}")
        widths, heights = calculate_image_dimensions_multiprocess(filtered_data, images_folder)
        if widths:
            print(f"Max width: {max(widths)}, Max height: {max(heights)}")
        else:
            print("No image dimensions could be read; skipping the width/height plot.")

    tokenized_lengths = calculate_tokenized_lengths(data)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24, 12))

    if widths:
        # Plot 2D histogram
        widths_bins = _histogram_bin_edges(widths)
        heights_bins = _histogram_bin_edges(heights)
        *_, image = ax1.hist2d(widths, heights, bins=[widths_bins, heights_bins], cmap=plt.cm.jet, density=True)
        fig.colorbar(image, ax=ax1)
        ax1.set_xlabel("Width")
        ax1.set_ylabel("Height")
        ax1.set_title(f"dist_{llava_instruct_name}_2d_w_h\nMax width: {max(widths)}, Max height: {max(heights)}", fontsize=10)

    if tokenized_lengths:
        # Plot histogram. The last edge must reach the longest conversation,
        # otherwise np.histogram silently drops every value beyond it; the
        # max(..., 1) guarantees at least two edges when all lengths are 0.
        step = 100
        bins = np.arange(0, max(max(tokenized_lengths), 1) + step, step)
        hist, bin_edges = np.histogram(tokenized_lengths, bins=bins)
        ax2.bar(bin_edges[:-1], hist, width=7, edgecolor="black", log=True)

        # Display every nth label on the x-axis
        n = 8  # Adjust this value to control the number of labels displayed
        ticks = bins[::n]
        tick_labels = [int(tick) for tick in ticks]
        ax2.set_xticks(ticks)
        ax2.set_xticklabels(tick_labels, rotation=90, fontsize=8)
        ax2.set_xlim(bin_edges[0], bin_edges[-1])
    else:
        print("No conversations found; skipping the tokenized-length plot.")

    ax2.set_xlabel("Tokenized Length")
    ax2.set_ylabel("Count (log scale)")
    ax2.set_title(f"dist_{llava_instruct_name}_tokenized_length", fontsize=8)

    plt.tight_layout()
    os.makedirs(args.output_dir, exist_ok=True)
    output_path = os.path.join(args.output_dir, f"dist_{llava_instruct_name}_combined.png")
    plt.savefig(output_path)
    plt.close(fig)
    print(f"Plots saved to {output_path}")


# Entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()