Spaces:

rnnandi
/

convert-gemma3-to-onnx

Running

File size: 44,279 Bytes

ca97aa9

import { Callable } from "../utils/generic.js";
import { Tensor, interpolate, stack } from "../utils/tensor.js";
import { bankers_round, max, min, softmax } from "../utils/maths.js";
import { RawImage } from "../utils/image.js";
import { calculateReflectOffset } from "../utils/core.js";
import { getModelJSON } from "../utils/hub.js";
import { IMAGE_PROCESSOR_NAME } from '../utils/constants.js';

/**
 * Named tuple to indicate the order we are using is (height x width),
 * even though the Graphics' industry standard is (width x height).
 * @typedef {[height: number, width: number]} HeightWidth
 */


/**
 * @typedef {object} ImageProcessorResult
 * @property {Tensor} pixel_values The pixel values of the batched preprocessed images.
 * @property {HeightWidth[]} original_sizes Array of two-dimensional tuples like [[480, 640]].
 * @property {HeightWidth[]} reshaped_input_sizes Array of two-dimensional tuples like [[1000, 1330]].
 */



/**
 * Snaps a value onto a multiple of `multiple`, then corrects the result if it
 * falls outside the optional bounds.
 * @param {number} val The value to snap.
 * @param {number} multiple The step the result must be a multiple of.
 * @param {number} [minVal=0] Lower bound; if the result is below it, the ratio is rounded up instead.
 * @param {number} [maxVal=null] Upper bound; if the result exceeds it, the ratio is rounded down instead.
 * @returns {number} The snapped value.
 * @private
 */
function constraint_to_multiple_of(val, multiple, minVal = 0, maxVal = null) {
    const ratio = val / multiple;

    // First attempt: round-half-to-even on the ratio.
    let snapped = bankers_round(ratio) * multiple;

    const exceedsMax = maxVal !== null && snapped > maxVal;
    if (exceedsMax) {
        snapped = Math.floor(ratio) * multiple;
    }

    // The lower bound is checked last, so it wins over the upper bound.
    if (snapped < minVal) {
        snapped = Math.ceil(ratio) * multiple;
    }

    return snapped;
}

/**
 * Rounds each dimension down to the nearest multiple of `divisor`,
 * never going below one full `divisor`.
 * @param {[number, number]} size The size of the image
 * @param {number} divisor The divisor to use.
 * @returns {[number, number]} The rounded size.
 */
function enforce_size_divisibility([width, height], divisor) {
    // Use at least one whole step so a dimension can never collapse to zero.
    const roundDown = (length) => Math.max(Math.floor(length / divisor), 1) * divisor;
    return [roundDown(width), roundDown(height)];
}


// Helper functions

/**
 * Converts a bounding box from center format to corners format.
 * 
 * @param {number[]} arr The box as (center_x, center_y, width, height).
 * @returns {number[]} The coordinates of the top-left and bottom-right corners of the box (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
 */
export function center_to_corners_format([centerX, centerY, width, height]) {
    const halfWidth = width / 2;
    const halfHeight = height / 2;

    const left = centerX - halfWidth;
    const top = centerY - halfHeight;
    const right = centerX + halfWidth;
    const bottom = centerY + halfHeight;

    return [left, top, right, bottom];
}

/**
 * Turns raw object-detection outputs into per-image lists of boxes, class ids and scores.
 * @param {Object} outputs The outputs of the model that must be post-processed
 * @param {Tensor} outputs.logits The logits
 * @param {Tensor} outputs.pred_boxes The predicted boxes.
 * @param {number} [threshold=0.5] The threshold to use for the scores.
 * @param {[number, number][]} [target_sizes=null] The sizes of the original images.
 * When given, box coordinates are multiplied by the matching dimension.
 * @param {boolean} [is_zero_shot=false] Whether zero-shot object detection was performed.
 * @return {Object[]} One `{ boxes, classes, scores }` object per batch item.
 */
export function post_process_object_detection(outputs, threshold = 0.5, target_sizes = null, is_zero_shot = false) {
    const { logits: out_logits, pred_boxes: out_bbox } = outputs;
    const [batch_size, num_boxes, num_classes] = out_logits.dims;

    if (target_sizes !== null && target_sizes.length !== batch_size) {
        throw new Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");
    }

    // In the regular (non zero-shot) path the last class is treated as background.
    const background_class = num_classes - 1;

    const results = [];
    for (let batch_index = 0; batch_index < batch_size; ++batch_index) {
        const target_size = target_sizes === null ? null : target_sizes[batch_index];
        const image_logits = out_logits[batch_index];
        const image_boxes = out_bbox[batch_index];
        const detections = { boxes: [], classes: [], scores: [] };

        for (let box_index = 0; box_index < num_boxes; ++box_index) {
            const box_logits = image_logits[box_index];
            const selected = [];
            let probs;

            if (is_zero_shot) {
                // Independent per-class probabilities: keep every class above the threshold.
                probs = box_logits.sigmoid().data;
                for (let class_index = 0; class_index < probs.length; ++class_index) {
                    if (probs[class_index] > threshold) {
                        selected.push(class_index);
                    }
                }
            } else {
                // Single-label case: only the arg-max class can be reported.
                const best_class = max(box_logits.data)[1];
                if (best_class === background_class) {
                    continue;
                }

                probs = softmax(box_logits.data);
                if (probs[best_class] < threshold) {
                    continue;
                }
                selected.push(best_class);
            }

            for (const class_index of selected) {
                // (cx, cy, w, h) -> (x0, y0, x1, y1)
                let box = center_to_corners_format(image_boxes[box_index].data);
                if (target_size !== null) {
                    // `target_size` is (height, width): even positions are x (scale by width),
                    // odd positions are y (scale by height).
                    box = box.map((coord, axis) => coord * target_size[(axis + 1) % 2]);
                }

                detections.boxes.push(box);
                detections.classes.push(class_index);
                detections.scores.push(probs[class_index]);
            }
        }
        results.push(detections);
    }
    return results;
}


/**
 * Post-processes the outputs of the model (for semantic segmentation).
 *
 * For every batch item the per-class logits are (optionally) resized and reduced to a
 * per-pixel arg-max map. The input logits are left untouched.
 * @param {*} outputs Raw outputs of the model.
 * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
 * (height, width) of each prediction. If unset, predictions will not be resized.
 * @returns {{segmentation: Tensor; labels: number[]}[]} The semantic segmentation maps.
 * @throws {Error} If `target_sizes` is given but its length differs from the batch size.
 */
export function post_process_semantic_segmentation(outputs, target_sizes = null) {

    const logits = outputs.logits;
    const batch_size = logits.dims[0];

    if (target_sizes !== null && target_sizes.length !== batch_size) {
        throw new Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");
    }

    const toReturn = [];
    for (let i = 0; i < batch_size; ++i) {
        const target_size = target_sizes !== null ? target_sizes[i] : null;

        let data = logits[i];

        // 1. If target_size is not null, we need to resize the masks to the target size
        if (target_size !== null) {
            data = interpolate(data, target_size, 'bilinear', false);
        }
        const [height, width] = target_size ?? data.dims.slice(-2);
        const num_classes = data.dims[0];

        const segmentation = new Tensor(
            'int32',
            new Int32Array(height * width),
            [height, width]
        );
        const segmentation_data = segmentation.data;

        // 2. Per-pixel arg-max over the class dimension.
        // Running maximum for each pixel, seeded with class 0. It is a COPY: indexing may
        // hand back a view onto the caller's logits, and writing the running maximum into
        // that view would overwrite the class-0 channel of `outputs.logits`.
        const best = data[0].data.slice();
        for (let j = 1; j < num_classes; ++j) {
            const row = data[j].data;
            for (let k = 0; k < row.length; ++k) {
                if (row[k] > best[k]) {
                    best[k] = row[k];
                    segmentation_data[k] = j;
                }
            }
        }

        // 3. Collect the labels that occur in the map, in ascending order.
        // A presence table is much cheaper than building a Set of the final values.
        const hasLabel = new Array(num_classes);
        for (let j = 0; j < segmentation_data.length; ++j) {
            const index = segmentation_data[j];
            hasLabel[index] = index;
        }
        /** @type {number[]} The unique list of labels that were detected */
        const labels = hasLabel.filter(x => x !== undefined);

        toReturn.push({ segmentation, labels });
    }
    return toReturn;
}


/**
 * Filters the queries of one batch item, dropping those predicted as background
 * (label equal to `num_labels`) and those whose score does not exceed `object_mask_threshold`.
 * @param {Tensor} class_logits The class logits.
 * @param {Tensor} mask_logits The mask logits.
 * @param {number} object_mask_threshold A number between 0 and 1; only queries scoring above it are kept.
 * @param {number} num_labels The number of labels.
 * @returns {[Tensor[], number[], number[]]} The kept masks, their scores, and their labels.
 * @private
 */
function remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels) {
    const kept_masks = [];
    const kept_scores = [];
    const kept_labels = [];

    const num_queries = class_logits.dims[0];
    for (let query = 0; query < num_queries; ++query) {
        const query_logits = class_logits[query];
        const query_mask = mask_logits[query];

        const label = max(query_logits.data)[1];
        const is_background = label === num_labels;
        if (!is_background) {
            const score = softmax(query_logits.data)[label];
            if (score > object_mask_threshold) {
                kept_masks.push(query_mask);
                kept_scores.push(score);
                kept_labels.push(label);
            }
        }
    }

    return [kept_masks, kept_scores, kept_labels];
}

/**
 * Decides whether query `k` forms a valid segment.
 *
 * A segment is valid when at least one pixel is assigned to `k`, at least one pixel of
 * `mask_probs[k]` reaches `mask_threshold`, and the ratio between those two pixel counts
 * is greater than `overlap_mask_area_threshold`.
 * @param {Int32Array} mask_labels Labels for each pixel in the mask.
 * @param {Tensor[]} mask_probs Probabilities for each pixel in the masks.
 * @param {number} k The class id of the segment.
 * @param {number} mask_threshold The mask threshold.
 * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
 * @returns {[boolean, number[]]} Whether the segment is valid or not, and the indices of the pixels labelled `k`.
 * @private
 */
function check_segment_validity(
    mask_labels,
    mask_probs,
    k,
    mask_threshold = 0.5,
    overlap_mask_area_threshold = 0.8
) {
    const probs_k = mask_probs[k].data;

    // Flat indices of the pixels whose winning label is `k`
    const pixel_indices = [];
    // Number of pixels where mask `k` on its own reaches the threshold
    let thresholded_area = 0;

    for (let pixel = 0; pixel < mask_labels.length; ++pixel) {
        if (mask_labels[pixel] === k) {
            pixel_indices.push(pixel);
        }
        if (probs_k[pixel] >= mask_threshold) {
            ++thresholded_area;
        }
    }

    const assigned_area = pixel_indices.length;

    // Both areas must be non-empty before the ratio (which filters out tiny
    // disconnected segments) is meaningful.
    const is_valid = assigned_area > 0
        && thresholded_area > 0
        && assigned_area / thresholded_area > overlap_mask_area_threshold;

    return [is_valid, pixel_indices];
}

/**
 * Builds the segmentation map and the list of segments for one batch item.
 *
 * NOTE: `mask_probs` is modified in place: entries are replaced by their resized
 * versions when `target_size` is given, and every mask is multiplied by its score.
 * @param {Tensor[]} mask_probs The mask probabilities.
 * @param {number[]} pred_scores The predicted scores.
 * @param {number[]} pred_labels The predicted labels.
 * @param {number} mask_threshold The mask threshold.
 * @param {number} overlap_mask_area_threshold The overlap mask area threshold.
 * @param {Set<number>} label_ids_to_fuse The label ids to fuse.
 * @param {number[]} target_size The target size of the image.
 * @returns {[Tensor, Array<{id: number, label_id: number, score: number}>]} The computed segments.
 * @private
 */
function compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold,
    overlap_mask_area_threshold,
    label_ids_to_fuse = null,
    target_size = null,
) {
    const [height, width] = target_size ?? mask_probs[0].dims;

    const segmentation = new Tensor(
        'int32',
        new Int32Array(height * width),
        [height, width]
    );
    const segmentation_data = segmentation.data;

    // 1. Bring every mask to the requested output size, if there is one
    if (target_size !== null) {
        for (let query = 0; query < mask_probs.length; ++query) {
            mask_probs[query] = interpolate(mask_probs[query], target_size, 'bilinear', false);
        }
    }

    // 2. Weigh each mask by its prediction score (in place) and record, for every
    //    pixel, which query has the highest weighted value.
    const num_pixels = mask_probs[0].data.length;
    const mask_labels = new Int32Array(num_pixels);
    const best_scores = new Float32Array(num_pixels);

    for (let query = 0; query < mask_probs.length; ++query) {
        const weight = pred_scores[query];
        const weighted = mask_probs[query].data;

        for (let pixel = 0; pixel < weighted.length; ++pixel) {
            weighted[pixel] *= weight;

            // Read the stored value back so the comparison uses exactly what was written.
            const value = weighted[pixel];
            if (value > best_scores[pixel]) {
                mask_labels[pixel] = query;
                best_scores[pixel] = value;
            }
        }
    }

    // 3. Turn every valid query into a segment with a fresh, 1-based id.
    // TODO: `label_ids_to_fuse` is accepted but not used yet, so instances that share
    // a label are never fused.
    const segments = [];
    for (let k = 0; k < pred_labels.length; ++k) {
        // Check if the mask exists and is large enough to be a segment
        const [mask_exists, mask_k] = check_segment_validity(
            mask_labels,
            mask_probs,
            k,
            mask_threshold,
            overlap_mask_area_threshold
        );

        if (mask_exists) {
            const segment_id = segments.length + 1;

            // Add current object segment to final segmentation map
            for (const pixel of mask_k) {
                segmentation_data[pixel] = segment_id;
            }

            segments.push({
                id: segment_id,
                label_id: pred_labels[k],
                score: pred_scores[k],
            });
        }
    }

    return [segmentation, segments];
}

/**
 * Computes a new (height, width) such that:
 *
 * 1. Both dimensions are multiples of `factor`.
 * 2. The pixel count is pulled into the range [`min_pixels`, `max_pixels`].
 * 3. The aspect ratio stays as close to the original as possible.
 * 
 * @param {number} height The height of the image.
 * @param {number} width The width of the image.
 * @param {number} [factor=28] The factor to use for resizing.
 * @param {number} [min_pixels=56*56] The minimum number of pixels.
 * @param {number} [max_pixels=14*14*4*1280] The maximum number of pixels.
 * @returns {[number, number]} The new height and width of the image.
 * @throws {Error} If the height or width is smaller than the factor, or if the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
    if (height < factor || width < factor) {
        throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
    }

    const aspect_ratio = Math.max(height, width) / Math.min(height, width);
    if (aspect_ratio > 200) {
        throw new Error(
            `absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`
        );
    }

    // Snap a length onto the `factor` grid using the given rounding function.
    const snap = (length, rounding) => rounding(length / factor) * factor;

    const h_bar = snap(height, Math.round);
    const w_bar = snap(width, Math.round);
    const area = h_bar * w_bar;

    if (area > max_pixels) {
        // Too large: shrink both sides by the same ratio, rounding down to stay under the cap.
        const beta = Math.sqrt((height * width) / max_pixels);
        return [snap(height / beta, Math.floor), snap(width / beta, Math.floor)];
    }

    if (area < min_pixels) {
        // Too small: grow both sides by the same ratio, rounding up to reach the minimum.
        const beta = Math.sqrt(min_pixels / (height * width));
        return [snap(height * beta, Math.ceil), snap(width * beta, Math.ceil)];
    }

    return [h_bar, w_bar];
}


/**
 * Converts raw model outputs into one panoptic segmentation result per batch item.
 * @param {*} outputs The model output to post process
 * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
 * @param {number} [mask_threshold=0.5] Threshold to use when turning the predicted masks into binary values.
 * @param {number} [overlap_mask_area_threshold=0.8] The overlap mask area threshold to merge or discard small disconnected parts within each binary instance mask.
 * @param {Set<number>} [label_ids_to_fuse=null] The labels in this state will have all their instances be fused together.
 * @param {[number, number][]} [target_sizes=null] The target sizes to resize the masks to.
 * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
 */
export function post_process_panoptic_segmentation(
    outputs,
    threshold = 0.5,
    mask_threshold = 0.5,
    overlap_mask_area_threshold = 0.8,
    label_ids_to_fuse = null,
    target_sizes = null,
) {
    if (label_ids_to_fuse === null) {
        console.warn("`label_ids_to_fuse` unset. No instance will be fused.");
        label_ids_to_fuse = new Set();
    }

    // Two naming schemes are accepted for the model outputs.
    // [batch_size, num_queries, num_classes+1]
    const class_queries_logits = outputs.class_queries_logits ?? outputs.logits;
    // [batch_size, num_queries, height, width]
    const masks_queries_logits = outputs.masks_queries_logits ?? outputs.pred_masks;

    // [batch_size, num_queries, height, width]
    const mask_probs = masks_queries_logits.sigmoid();

    const [batch_size, , num_classes] = class_queries_logits.dims;
    // The last class is the background, so it is not counted as a label
    const num_labels = num_classes - 1;

    if (target_sizes !== null && target_sizes.length !== batch_size) {
        throw new Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");
    }

    const results = [];
    for (let batch_index = 0; batch_index < batch_size; ++batch_index) {
        const target_size = target_sizes === null ? null : target_sizes[batch_index];

        const class_logits = class_queries_logits[batch_index];
        const mask_logits = mask_probs[batch_index];

        const [kept_masks, kept_scores, kept_labels] = remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels);

        let segmentation;
        let segments_info;
        if (kept_labels.length === 0) {
            // No mask found: return a map filled with -1 and no segments
            const [height, width] = target_size ?? mask_logits.dims.slice(-2);
            segmentation = new Tensor(
                'int32',
                new Int32Array(height * width).fill(-1),
                [height, width]
            );
            segments_info = [];
        } else {
            // Get segmentation map and segment information of batch item
            [segmentation, segments_info] = compute_segments(
                kept_masks,
                kept_scores,
                kept_labels,
                mask_threshold,
                overlap_mask_area_threshold,
                label_ids_to_fuse,
                target_size,
            );
        }

        results.push({ segmentation, segments_info });
    }

    return results;
}


/**
 * Placeholder for instance-segmentation post-processing. Always throws.
 * @param {*} outputs Raw outputs of the model.
 * @param {number} [threshold=0.5] The probability score threshold to keep predicted instance masks.
 * @param {[number, number][]} [target_sizes=null] List of tuples corresponding to the requested final size
 * (height, width) of each prediction. If unset, predictions will not be resized.
 * @returns {Array<{ segmentation: Tensor, segments_info: Array<{id: number, label_id: number, score: number}>}>}
 * @throws {Error} Always, since the function is not implemented yet.
 */
export function post_process_instance_segmentation(outputs, threshold = 0.5, target_sizes = null) {
    const message = '`post_process_instance_segmentation` is not yet implemented.';
    throw new Error(message);
}


/**
 * @typedef {Object} ImageProcessorConfig A configuration object used to create an image processor.    
 * @property {function} [progress_callback=null] If specified, this function will be called during model construction, to provide the user with progress updates.
 * @property {number[]} [image_mean] The mean values for image normalization.
 * @property {number[]} [image_std] The standard deviation values for image normalization.
 * @property {boolean} [do_rescale] Whether to rescale the image pixel values to the [0,1] range.
 * @property {number} [rescale_factor] The factor to use for rescaling the image pixel values.
 * @property {boolean} [do_normalize] Whether to normalize the image pixel values.
 * @property {boolean} [do_resize] Whether to resize the image.
 * @property {number} [resample] What method to use for resampling.
 * @property {number|Object} [size] The size to resize the image to.
 * @property {number|Object} [image_size] The size to resize the image to (same as `size`).
 * @property {boolean} [do_flip_channel_order=false] Whether to flip the color channels from RGB to BGR.
 * Can be overridden by the `do_flip_channel_order` parameter in the `preprocess` method.
 * @property {boolean} [do_center_crop] Whether to center crop the image to the specified `crop_size`.
 * Can be overridden by `do_center_crop` in the `preprocess` method.
 * @property {boolean} [do_thumbnail] Whether to resize the image using thumbnail method.
 * @property {boolean} [keep_aspect_ratio] If `true`, the image is resized to the largest possible size such that the aspect ratio is preserved.
 * Can be overridden by `keep_aspect_ratio` in `preprocess`.
 * @property {number} [ensure_multiple_of] If `do_resize` is `true`, the image is resized to a size that is a multiple of this value.
 * Can be overridden by `ensure_multiple_of` in `preprocess`.
 * 
 * @property {number[]} [mean] The mean values for image normalization (same as `image_mean`).
 * @property {number[]} [std] The standard deviation values for image normalization (same as `image_std`).
 */

export class ImageProcessor extends Callable {

    /**
     * Constructs a new `ImageProcessor`.
     * @param {ImageProcessorConfig} config The configuration object.
     */
    constructor(config) {
        super();

        // Copies the supported options from `config` onto the instance, applying a
        // default only where one is written out below; all other options stay
        // `undefined` when the config omits them.

        // Normalization statistics; `mean`/`std` are accepted as fallback names.
        this.image_mean = config.image_mean ?? config.mean;
        this.image_std = config.image_std ?? config.std;

        this.resample = config.resample ?? 2; // 2 => bilinear
        // Rescaling is on by default, with a factor of 1/255.
        this.do_rescale = config.do_rescale ?? true;
        this.rescale_factor = config.rescale_factor ?? (1 / 255);
        this.do_normalize = config.do_normalize;

        this.do_thumbnail = config.do_thumbnail;
        // `image_size` is accepted as a fallback name for `size`.
        this.size = config.size ?? config.image_size;
        // Unless stated explicitly, resizing is enabled exactly when a size is available.
        this.do_resize = config.do_resize ?? (this.size !== undefined);
        // `size_divisor` is accepted as a fallback name for `size_divisibility`.
        // @ts-expect-error TS2339
        this.size_divisibility = config.size_divisibility ?? config.size_divisor;

        this.do_center_crop = config.do_center_crop;
        // @ts-expect-error TS2339
        this.crop_size = config.crop_size;
        // RGB conversion is on by default.
        // @ts-expect-error TS2339
        this.do_convert_rgb = config.do_convert_rgb ?? true;
        // @ts-expect-error TS2339
        this.do_crop_margin = config.do_crop_margin;

        // @ts-expect-error TS2339
        this.pad_size = config.pad_size;
        // @ts-expect-error TS2339
        this.do_pad = config.do_pad;
        // @ts-expect-error TS2339
        this.min_pixels = config.min_pixels;
        // @ts-expect-error TS2339
        this.max_pixels = config.max_pixels;

        // Only a `size` that carries both `width` and `height` can stand in for the pad size.
        if (this.do_pad && !this.pad_size && this.size && this.size.width !== undefined && this.size.height !== undefined) {
            // Should pad, but no pad size specified
            // We infer the pad size from the resize size
            this.pad_size = this.size
        }

        this.do_flip_channel_order = config.do_flip_channel_order ?? false;

        // Keep the raw config so options not copied above remain reachable.
        this.config = config;
    }

    /**
     * Resize the image to make a thumbnail. The image is resized so that no dimension is larger than any
     * corresponding dimension of the specified size.
     * @param {RawImage} image The image to be resized.
     * @param {{height:number, width:number}} size The size `{"height": h, "width": w}` to resize the image to.
     * @param {string | 0 | 1 | 2 | 3 | 4 | 5} [resample=2] The resampling filter to use.
     * @returns {Promise<RawImage>} The resized image.
     */
    async thumbnail(image, size, resample = 2) {
        const input_height = image.height;
        const input_width = image.width;

        const output_height = size.height;
        const output_width = size.width;

        // We always resize to the smallest of either the input or output size.
        let height = Math.min(input_height, output_height)
        let width = Math.min(input_width, output_width)

        if (height === input_height && width === input_width) {
            return image;
        }
        if (input_height > input_width) {
            width = Math.floor(input_width * height / input_height);
        } else if (input_width > input_height) {
            height = Math.floor(input_height * width / input_width);
        }
        return await image.resize(width, height, { resample });
    }


    /**
     * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
     * @param {RawImage} image The image to be cropped.
     * @param {number} gray_threshold Value below which pixels are considered to be gray.
     * @returns {Promise<RawImage>} The cropped image.
     */
    async crop_margin(image, gray_threshold = 200) {

        const gray_image = image.clone().grayscale();

        const minValue = min(gray_image.data)[0];
        const maxValue = max(gray_image.data)[0];
        const diff = maxValue - minValue;

        if (diff === 0) {
            return image;
        }

        const threshold = gray_threshold / 255;

        let x_min = gray_image.width, y_min = gray_image.height, x_max = 0, y_max = 0;
        const gray_image_data = gray_image.data;
        for (let j = 0; j < gray_image.height; ++j) {
            const row = j * gray_image.width;
            for (let i = 0; i < gray_image.width; ++i) {
                if ((gray_image_data[row + i] - minValue) / diff < threshold) {
                    // We have a non-zero pixel, so we update the min/max values accordingly
                    x_min = Math.min(x_min, i);
                    y_min = Math.min(y_min, j);
                    x_max = Math.max(x_max, i);
                    y_max = Math.max(y_max, j);
                }
            }
        }

        image = await image.crop([x_min, y_min, x_max, y_max]);
        return image;
    }

    /**
     * Pad the image by a certain amount.
     * @param {Float32Array} pixelData The pixel data to pad.
     * @param {number[]} imgDims The dimensions of the image (height, width, channels).
     * @param {{width:number; height:number}|number|'square'} padSize The dimensions of the padded image.
     * @param {Object} options The options for padding.
     * @param {'constant'|'symmetric'} [options.mode='constant'] The type of padding to add.
     * @param {boolean} [options.center=false] Whether to center the image.
     * @param {number|number[]} [options.constant_values=0] The constant value to use for padding.
     * @returns {[Float32Array, number[]]} The padded pixel data and image dimensions.
     */
    pad_image(pixelData, imgDims, padSize, {
        mode = 'constant',
        center = false,
        constant_values = 0,
    } = {}) {
        const [imageHeight, imageWidth, imageChannels] = imgDims;

        // Resolve the padded size from the three accepted forms of `padSize`:
        // a single number (square of that side), 'square' (side = longest image edge),
        // or an object with `width` and `height`.
        let paddedImageWidth, paddedImageHeight;
        if (typeof padSize === 'number') {
            paddedImageWidth = padSize;
            paddedImageHeight = padSize;
        } else if (padSize === 'square') {
            paddedImageWidth = paddedImageHeight = Math.max(imageHeight, imageWidth);
        } else {
            paddedImageWidth = padSize.width;
            paddedImageHeight = padSize.height;
        }

        // NOTE(review): there is no check that the padded size is at least as large as
        // the image; the copy loops below are written for that case — confirm callers guarantee it.

        // Only add padding if there is a difference in size
        if (paddedImageWidth !== imageWidth || paddedImageHeight !== imageHeight) {
            // A fresh Float32Array is zero-filled, so a fill of 0 needs no extra work.
            const paddedPixelData = new Float32Array(paddedImageWidth * paddedImageHeight * imageChannels);
            if (Array.isArray(constant_values)) {
                // Fill with constant values, cycling through the array
                for (let i = 0; i < paddedPixelData.length; ++i) {
                    paddedPixelData[i] = constant_values[i % imageChannels];
                }
            } else if (constant_values !== 0) {
                paddedPixelData.fill(constant_values);
            }

            // Offset of the image inside the padded buffer: centered, or anchored at the top-left.
            const [left, top] = center
                ? [Math.floor((paddedImageWidth - imageWidth) / 2), Math.floor((paddedImageHeight - imageHeight) / 2)]
                : [0, 0];

            // Copy the original image into the padded image
            // (the indexing treats both buffers as row-major with the channels of a pixel stored contiguously)
            for (let i = 0; i < imageHeight; ++i) {
                const a = (i + top) * paddedImageWidth; // start of the destination row (in pixels)
                const b = i * imageWidth; // start of the source row (in pixels)
                for (let j = 0; j < imageWidth; ++j) {
                    const c = (a + j + left) * imageChannels; // destination element offset
                    const d = (b + j) * imageChannels; // source element offset
                    for (let k = 0; k < imageChannels; ++k) {
                        paddedPixelData[c + k] = pixelData[d + k];
                    }
                }
            }

            if (mode === 'symmetric') {
                if (center) {
                    throw new Error('`center` padding is not supported when `mode` is set to `symmetric`.');
                    // TODO: Implement this
                }
                // Largest valid row/column index of the source image.
                const h1 = imageHeight - 1;
                const w1 = imageWidth - 1;
                // Every padded pixel outside the original image is overwritten with a source
                // pixel chosen by `calculateReflectOffset` (presumably a mirrored index into
                // [0, h1] / [0, w1] — verify against that helper), replacing the constant fill.
                for (let i = 0; i < paddedImageHeight; ++i) {
                    const a = i * paddedImageWidth;
                    const b = calculateReflectOffset(i, h1) * imageWidth;

                    for (let j = 0; j < paddedImageWidth; ++j) {
                        if (i < imageHeight && j < imageWidth) continue; // Do not overwrite original image
                        const c = (a + j) * imageChannels;
                        const d = (b + calculateReflectOffset(j, w1)) * imageChannels;

                        // Copy channel-wise
                        for (let k = 0; k < imageChannels; ++k) {
                            paddedPixelData[c + k] = pixelData[d + k];
                        }
                    }
                }
            }


            // Update pixel data and image dimensions
            pixelData = paddedPixelData;
            imgDims = [paddedImageHeight, paddedImageWidth, imageChannels]
        }
        // When no padding was needed, the inputs are returned unchanged (same array objects).
        return [pixelData, imgDims];
    }

    /**
     * Rescale the image's pixel values by `this.rescale_factor`.
     * @param {Float32Array} pixelData The pixel data to rescale.
     * @returns {void}
     */
    rescale(pixelData) {
        for (let i = 0; i < pixelData.length; ++i) {
            pixelData[i] = this.rescale_factor * pixelData[i];
        }
    }

    /**
     * Find the target (width, height) dimension of the output image after
     * resizing given the input image and the desired size.
     * @param {RawImage} image The image to resize.
     * @param {any} size The size to use for resizing the image. 
     * @returns {[number, number]} The target (width, height) dimension of the output image after resizing.
     */
    get_resize_output_image_size(image, size) {
        // `size` comes in many forms, so we need to handle them all here:
        // 1. `size` is an integer, in which case we resize the image to be a square 

        const [srcWidth, srcHeight] = image.size;

        let shortest_edge;
        let longest_edge;

        if (this.do_thumbnail) {
            // NOTE: custom logic for `Donut` models
            const { height, width } = size;
            shortest_edge = Math.min(height, width)
        }
        // Support both formats for backwards compatibility
        else if (Number.isInteger(size)) {
            shortest_edge = size;
            // @ts-expect-error TS2339
            longest_edge = this.config.max_size ?? shortest_edge;

        } else if (size !== undefined) {
            // Extract known properties from `size`
            shortest_edge = size.shortest_edge;
            longest_edge = size.longest_edge;
        }

        // If `longest_edge` and `shortest_edge` are set, maintain aspect ratio and resize to `shortest_edge`
        // while keeping the largest dimension <= `longest_edge`
        if (shortest_edge !== undefined || longest_edge !== undefined) {
            // http://opensourcehacker.com/2011/12/01/calculate-aspect-ratio-conserving-resize-for-images-in-javascript/
            // Try resize so that shortest edge is `shortest_edge` (target)
            const shortResizeFactor = shortest_edge === undefined
                ? 1 // If `shortest_edge` is not set, don't upscale
                : Math.max(shortest_edge / srcWidth, shortest_edge / srcHeight);

            const newWidth = srcWidth * shortResizeFactor;
            const newHeight = srcHeight * shortResizeFactor;

            // The new width and height might be greater than `longest_edge`, so
            // we downscale again to ensure the largest dimension is `longest_edge` 
            const longResizeFactor = longest_edge === undefined
                ? 1 // If `longest_edge` is not set, don't downscale
                : Math.min(longest_edge / newWidth, longest_edge / newHeight);

            // To avoid certain floating point precision issues, we round to 2 decimal places
            let finalWidth = Math.floor(Number((newWidth * longResizeFactor).toFixed(2)));
            let finalHeight = Math.floor(Number((newHeight * longResizeFactor).toFixed(2)));

            if (this.size_divisibility !== undefined) {
                [finalWidth, finalHeight] = enforce_size_divisibility([finalWidth, finalHeight], this.size_divisibility)
            }
            return [finalWidth, finalHeight];

        } else if (size !== undefined && size.width !== undefined && size.height !== undefined) {
            // If `width` and `height` are set, resize to those dimensions

            let newWidth = size.width;
            let newHeight = size.height;

            // Custom for DPT models
            if (this.config.keep_aspect_ratio && this.config.ensure_multiple_of) {

                // determine new height and width
                let scale_height = newHeight / srcHeight;
                let scale_width = newWidth / srcWidth;

                // scale as little as possible
                if (Math.abs(1 - scale_width) < Math.abs(1 - scale_height)) {
                    // fit width
                    scale_height = scale_width;
                } else {
                    // fit height
                    scale_width = scale_height;
                }

                newHeight = constraint_to_multiple_of(scale_height * srcHeight, this.config.ensure_multiple_of);
                newWidth = constraint_to_multiple_of(scale_width * srcWidth, this.config.ensure_multiple_of);
            }

            return [newWidth, newHeight];

        } else if (this.size_divisibility !== undefined) {
            return enforce_size_divisibility([srcWidth, srcHeight], this.size_divisibility);
        } else if (this.min_pixels !== undefined && this.max_pixels !== undefined) {
            // Custom resize logic for Qwen2-VL models
            // @ts-expect-error TS2339
            const factor = this.config.patch_size * this.config.merge_size;
            return smart_resize(srcHeight, srcWidth, factor, this.min_pixels, this.max_pixels);
        } else {
            throw new Error(`Could not resize image due to unsupported \`this.size\` option in config: ${JSON.stringify(size)}`);
        }
    }

    /**
     * Resizes the image.
     * @param {RawImage} image The image to resize.
     * @returns {Promise<RawImage>} The resized image.
     */
    async resize(image) {
        const [newWidth, newHeight] = this.get_resize_output_image_size(image, this.size);
        return await image.resize(newWidth, newHeight, {
            // @ts-expect-error TS2322
            resample: this.resample,
        });
    }

    /**
     * @typedef {object} PreprocessedImage
     * @property {HeightWidth} original_size The original size of the image.
     * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
     * @property {Tensor} pixel_values The pixel values of the preprocessed image.
     */

    /**
     * Preprocesses the given image.
     *
     * @param {RawImage} image The image to preprocess.
     * @param {Object} overrides The overrides for the preprocessing options.
     * @returns {Promise<PreprocessedImage>} The preprocessed image.
     */
    async preprocess(image, {
        do_normalize = null,
        do_pad = null,
        do_convert_rgb = null,
        do_convert_grayscale = null,
        do_flip_channel_order = null,
    } = {}) {
        if (this.do_crop_margin) {
            // NOTE: Specific to nougat processors. This is done before resizing,
            // and can be interpreted as a pre-preprocessing step.
            image = await this.crop_margin(image);
        }

        const [srcWidth, srcHeight] = image.size; // original image size

        // Convert image to RGB if specified in config.
        if (do_convert_rgb ?? this.do_convert_rgb) {
            image = image.rgb();
        } else if (do_convert_grayscale) {
            image = image.grayscale();
        }

        // TODO:
        // For efficiency reasons, it might be best to merge the resize and center crop operations into one.

        // Resize all images
        if (this.do_resize) {
            image = await this.resize(image);
        }

        // Resize the image using thumbnail method.
        if (this.do_thumbnail) {
            // @ts-expect-error TS2345
            image = await this.thumbnail(image, this.size, this.resample);
        }

        if (this.do_center_crop) {

            let crop_width;
            let crop_height;
            if (Number.isInteger(this.crop_size)) {
                crop_width = this.crop_size;
                crop_height = this.crop_size;
            } else {
                crop_width = this.crop_size.width;
                crop_height = this.crop_size.height;
            }

            image = await image.center_crop(crop_width, crop_height);
        }

        /** @type {HeightWidth} */
        const reshaped_input_size = [image.height, image.width];

        // NOTE: All pixel-level manipulation (i.e., modifying `pixelData`)
        // occurs with data in the hwc format (height, width, channels), 
        // to emulate the behavior of the original Python code (w/ numpy).
        /** @type {Float32Array} */
        let pixelData = Float32Array.from(image.data);
        let imgDims = [image.height, image.width, image.channels];

        if (this.do_rescale) {
            this.rescale(pixelData);
        }

        if (do_normalize ?? this.do_normalize) {
            let image_mean = this.image_mean;
            if (!Array.isArray(this.image_mean)) {
                image_mean = new Array(image.channels).fill(image_mean);
            }

            let image_std = this.image_std;
            if (!Array.isArray(this.image_std)) {
                image_std = new Array(image.channels).fill(image_mean);
            }

            if (image_mean.length !== image.channels || image_std.length !== image.channels) {
                throw new Error(`When set to arrays, the length of \`image_mean\` (${image_mean.length}) and \`image_std\` (${image_std.length}) must match the number of channels in the image (${image.channels}).`);
            }

            for (let i = 0; i < pixelData.length; i += image.channels) {
                for (let j = 0; j < image.channels; ++j) {
                    pixelData[i + j] = (pixelData[i + j] - image_mean[j]) / image_std[j];
                }
            }
        }

        // do padding after rescaling/normalizing
        if (do_pad ?? this.do_pad) {
            if (this.pad_size) {
                const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
                [pixelData, imgDims] = padded; // Update pixel data and image dimensions
            } else if (this.size_divisibility) {
                const [paddedWidth, paddedHeight] = enforce_size_divisibility([imgDims[1], imgDims[0]], this.size_divisibility);
                [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
            }
        }

        if (do_flip_channel_order ?? this.do_flip_channel_order) {
            if (imgDims[2] !== 3) {
                throw new Error('Flipping channel order is only supported for RGB images.');
            }
            // Convert RGB to BGR
            for (let i = 0; i < pixelData.length; i += 3) {
                const temp = pixelData[i];
                pixelData[i] = pixelData[i + 2];
                pixelData[i + 2] = temp;
            }
        }

        const pixel_values = new Tensor('float32', pixelData, imgDims)
            .permute(2, 0, 1); // convert to channel dimension format (hwc -> chw)

        return {
            original_size: [srcHeight, srcWidth],
            reshaped_input_size: reshaped_input_size,
            pixel_values,
        }
    }

    /**
     * Calls the feature extraction process on an array of images,
     * preprocesses each image, and concatenates the resulting
     * features into a single Tensor.
     * @param {RawImage[]} images The image(s) to extract features from.
     * @param {...any} args Additional arguments.
     * @returns {Promise<ImageProcessorResult>} An object containing the concatenated pixel values (and other metadata) of the preprocessed images.
     */
    async _call(images, ...args) {
        if (!Array.isArray(images)) {
            images = [images];
        }
        /** @type {PreprocessedImage[]} */
        const imageData = await Promise.all(images.map(x => this.preprocess(x)));

        // Stack pixel values
        const pixel_values = stack(imageData.map(x => x.pixel_values), 0);

        return {
            pixel_values,

            // Original sizes of images
            original_sizes: imageData.map(x => x.original_size),

            // Reshaped sizes of images, before padding or cropping
            reshaped_input_sizes: imageData.map(x => x.reshaped_input_size),
        }
    }


    /**
     * Instantiate one of the processor classes of the library from a pretrained model.
     * 
     * The processor class to instantiate is selected based on the `image_processor_type` (or `feature_extractor_type`; legacy)
     * property of the config object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible)
     * 
     * @param {string} pretrained_model_name_or_path The name or path of the pretrained model. Can be either:
     * - A string, the *model id* of a pretrained processor hosted inside a model repo on huggingface.co.
     *   Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
     *   user or organization name, like `dbmdz/bert-base-german-cased`.
     * - A path to a *directory* containing processor files, e.g., `./my_model_directory/`.
     * @param {import('../utils/hub.js').PretrainedOptions} options Additional options for loading the processor.
     * 
     * @returns {Promise<ImageProcessor>} A new instance of the Processor class.
     */
    static async from_pretrained(pretrained_model_name_or_path, options={}) {
        const preprocessorConfig = await getModelJSON(pretrained_model_name_or_path, IMAGE_PROCESSOR_NAME, true, options);
        return new this(preprocessorConfig);
    }
}