Spaces:
Running
Running
File size: 1,706 Bytes
ca97aa9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";
/**
 * Processor that combines an image processor and a tokenizer.
 *
 * When images are supplied, every `<|image_pad|>` marker in the text is expanded
 * into a run of `<|image_pad|>` markers whose length is derived from the
 * corresponding row of the image processor's `image_grid_thw` output, and the
 * expanded text is then tokenized.
 */
export class Qwen2VLProcessor extends Processor {
    static image_processor_class = AutoImageProcessor;
    static tokenizer_class = AutoTokenizer;

    /**
     * Prepare text (and optionally images) for the model.
     *
     * @param {string|string[]} text One prompt or a list of prompts. A single string is wrapped in an array.
     * @param {RawImage|RawImage[]} images Optional image(s). When falsy, the text is tokenized unchanged.
     * @param {...any} args Accepted for interface compatibility; currently unused.
     * @returns {Promise<any>} The tokenizer output merged with the image processor output (if any).
     * @throws {Error} If the text contains more `<|image_pad|>` markers than the
     * image processor returned rows in `image_grid_thw`.
     */
    async _call(text, images = null, ...args) {
        if (!Array.isArray(text)) {
            text = [text];
        }

        let image_inputs, image_grid_thw;
        if (images) {
            image_inputs = await this.image_processor(images);
            image_grid_thw = image_inputs.image_grid_thw;
        }

        if (image_grid_thw) {
            const image_token = "<|image_pad|>";
            // Temporary stand-in so that already-expanded markers are not matched
            // again by the `includes` check while the remaining ones are processed.
            const placeholder_token = "<|placeholder|>";

            // @ts-expect-error TS2551
            const merge_length = this.image_processor.config.merge_size ** 2;

            // Rows are consumed in order across *all* prompts, one row per marker.
            let index = 0;
            const image_grid_thw_list = image_grid_thw.tolist();

            text = text.map(t => {
                while (t.includes(image_token)) {
                    const grid = image_grid_thw_list[index++];
                    if (grid === undefined) {
                        // Without this check the failure surfaces as an opaque
                        // "Cannot read properties of undefined (reading 'reduce')".
                        throw new Error(
                            `Found more "${image_token}" placeholders in the text than image grids ` +
                            `(${image_grid_thw_list.length}) returned by the image processor.`
                        );
                    }

                    // The fold starts from `1n`, so the row entries are expected to be BigInt.
                    const prod = Number(grid.reduce((a, b) => a * b, 1n));
                    t = t.replace(image_token, placeholder_token.repeat(Math.floor(prod / merge_length)));
                }
                return t.replaceAll(placeholder_token, image_token);
            });
        }

        const text_inputs = this.tokenizer(text);

        return {
            ...text_inputs,
            ...image_inputs,
            // TODO: ...videos_inputs,
        };
    }
}
|