Spaces:

rnnandi
/

convert-gemma3-to-onnx

Running

File size: 9,265 Bytes

ca97aa9



import {
    ImageProcessor,
} from "../../base/image_processors_utils.js";
import { cat, full, interpolate_4d, slice, stack } from "../../utils/tensor.js";

export class Idefics3ImageProcessor extends ImageProcessor {
    constructor(config) {
        super(config);

        this.do_image_splitting = config.do_image_splitting ?? true;
        this.max_image_size = config.max_image_size;
    }

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     * @typedef {import('../../utils/tensor.js').Tensor} Tensor
     */

    /**
     * Calculate size to resize images to, to be multiples of `vision_encoder_max_size` while preserving the aspect ratio.
     * @param {Tensor} pixel_values Tensor of the image to resize.
     * @param {number} vision_encoder_max_size Maximum size of the output image. If the image is larger than this size,
     * it will be split into patches of this size, and the original image will be concatenated with the patches, resized to max_size.
     */
    get_resize_for_vision_encoder(pixel_values, vision_encoder_max_size) {
        let [height, width] = pixel_values.dims.slice(-2);

        const aspect_ratio = width / height;
        if (width >= height) {
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
            height = Math.floor(width / aspect_ratio);
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
        } else {
            height = Math.ceil(height / vision_encoder_max_size) * vision_encoder_max_size;
            width = Math.floor(height * aspect_ratio);
            width = Math.ceil(width / vision_encoder_max_size) * vision_encoder_max_size;
        }
        return { height, width };
    }

    /** @param {RawImage|RawImage[]|RawImage[][]} images */
    async _call(images, {
        do_image_splitting = null,
        return_row_col_info = false,
    } = {}) {

        /** @type {RawImage[][]} */
        let batched_2d_images;
        if (!Array.isArray(images)) {
            batched_2d_images = [[images]];
        } else {
            if (images.length === 0 || !images[0]) {
                throw new Error("No images provided.");
            }
            if (!Array.isArray(images[0])) {
                batched_2d_images = [/** @type {RawImage[]} */(images)];
            } else {
                batched_2d_images = /** @type {RawImage[][]} */(images);
            }
        }

        // List of tensors, each with shape [patches, channels, height, width]
        let all_pixel_values = [];
        let images_list_rows = [];
        let images_list_cols = [];

        const original_sizes = [];
        const reshaped_input_sizes = [];
        for (const image_batch of batched_2d_images) {

            let images_list = await Promise.all(image_batch.map(x => this.preprocess(x)));

            // Original sizes of images
            original_sizes.push(...images_list.map(x => x.original_size));

            // Reshaped sizes of images, before padding or cropping
            reshaped_input_sizes.push(...images_list.map(x => x.reshaped_input_size));

            // Convert images to 4D tensors for easier processing
            images_list.forEach(x => x.pixel_values.unsqueeze_(0));

            const { longest_edge } = this.max_image_size;

            /** @type {Tensor[]} */
            let images_tensor;
            if (do_image_splitting ?? this.do_image_splitting) {
                let image_rows = new Array(images_list.length);
                let image_cols = new Array(images_list.length);

                // We first resize both height and width of each image to the nearest max_image_size multiple, disregarding the aspect ratio
                images_tensor = await Promise.all(images_list.map(async (x, i) => {
                    const new_size = this.get_resize_for_vision_encoder(x.pixel_values, longest_edge);

                    const resized = await interpolate_4d(x.pixel_values, {
                        size: [new_size.height, new_size.width],
                    });

                    const { frames, num_splits_h, num_splits_w } = await this.split_image(resized, this.max_image_size);
                    image_rows[i] = num_splits_h;
                    image_cols[i] = num_splits_w;
                    return cat(frames, 0);
                }));

                images_list_rows.push(image_rows);
                images_list_cols.push(image_cols);

            } else {
                /** @type {[number, number]} */
                const size = [longest_edge, longest_edge];
                images_tensor = await Promise.all(
                    images_list.map(x => interpolate_4d(x.pixel_values, { size }))
                );

                images_list_rows.push(new Array(images_list.length).fill(0));
                images_list_cols.push(new Array(images_list.length).fill(0));
            }

            all_pixel_values.push(cat(images_tensor, 0));
        }

        const batch_size = all_pixel_values.length;
        const [n, c, h, w] = all_pixel_values[0].dims;

        // Stack pixel values
        let pixel_values;
        let pixel_attention_mask;
        if (batch_size === 1) {
            pixel_values = all_pixel_values[0].unsqueeze_(0);
            pixel_attention_mask = full([batch_size, n, h, w], true);
        } else {
            // Add padding (if necessary) to images with less patches than the maximum number of patches
            const max_num_patches = Math.max(...all_pixel_values.map(x => x.dims.at(0)));

            pixel_attention_mask = full([batch_size, max_num_patches, h, w], true);
            const pixel_attention_mask_data = pixel_attention_mask.data;
            const pixel_attention_mask_stride = max_num_patches * h * w;
            for (let i = 0; i < batch_size; ++i) {
                const num_patches = all_pixel_values[i].dims[0];
                if (num_patches < max_num_patches) {
                    all_pixel_values[i] = cat([
                        all_pixel_values[i],
                        full([max_num_patches - num_patches, c, h, w], 0),
                    ], 0);

                    const start_offset = i * pixel_attention_mask_stride + num_patches * h * w;
                    const end_offset = (i + 1) * pixel_attention_mask_stride;

                    // @ts-ignore
                    pixel_attention_mask_data.fill(false, start_offset, end_offset);
                }
            }
            pixel_values = stack(all_pixel_values, 0);
        }

        return {
            pixel_values,
            pixel_attention_mask,

            original_sizes,
            reshaped_input_sizes,
            ...(
                return_row_col_info
                    ? { rows: images_list_rows, cols: images_list_cols }
                    : {}
            ),
        }
    }

    async split_image(pixel_values, { longest_edge }) {
        const max_height = longest_edge;
        const max_width = longest_edge;

        const frames = [];

        const [height, width] = pixel_values.dims.slice(-2);

        let num_splits_h = 0, num_splits_w = 0;

        if (height > max_height || width > max_width) {
            // Calculate the number of splits
            num_splits_h = Math.ceil(height / max_height);
            num_splits_w = Math.ceil(width / max_width);

            // Calculate the optimal width and height for the sub-images
            const optimal_height = Math.ceil(height / num_splits_h);
            const optimal_width = Math.ceil(width / num_splits_w);

            // Iterate through each row and column
            for (let r = 0; r < num_splits_h; ++r) {
                for (let c = 0; c < num_splits_w; ++c) {
                    let start_x, start_y, end_x, end_y;
                    if (r === num_splits_h - 1) { // At bottom
                        start_y = height - optimal_height;
                        end_y = height;
                    } else {
                        start_y = r * optimal_height;
                        end_y = (r + 1) * optimal_height;
                    }
                    if (c === num_splits_w - 1) { // At right
                        start_x = width - optimal_width;
                        end_x = width;
                    } else {
                        start_x = c * optimal_width;
                        end_x = (c + 1) * optimal_width;
                    }

                    const starts = [start_y, start_x];
                    const ends = [end_y, end_x];

                    const patch = await slice(pixel_values, starts, ends, [2, 3]);
                    frames.push(patch);
                }
            }

            // Resize the global image to match max dimensions for memory efficiency
            const global_image_height = max_height;
            const global_image_width = max_width;

            if (height !== global_image_height || width !== global_image_width) {
                pixel_values = await interpolate_4d(pixel_values, {
                    size: [global_image_height, global_image_width],
                })
            }
        }

        frames.push(pixel_values);

        return { frames, num_splits_h, num_splits_w };
    }
}