Spaces:
Running
Running
File size: 1,538 Bytes
ca97aa9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
/**
 * Processor that runs the image processor over the supplied images and, when a
 * prompt is given, expands every image placeholder token in the prompt before
 * handing the prompt to the tokenizer.
 */
export class LlavaProcessor extends Processor {
    static tokenizer_class = AutoTokenizer
    static image_processor_class = AutoImageProcessor
    static uses_processor_config = true;

    /**
     * @typedef {import('../../utils/image.js').RawImage} RawImage
     */

    /**
     * Process images and, optionally, text. `images` is required, `text` is optional.
     *
     * Reads `image_token`, `patch_size` and `num_additional_image_tokens` from `this.config`.
     *
     * @param {RawImage|RawImage[]} images Forwarded as-is to `this.image_processor`.
     * @param {string|string[]|null} [text=null] Prompt(s). The caller's value is never modified.
     * A falsy value (e.g. `null` or `""`) skips tokenization entirely.
     * @param {Object} [kwargs={}] Forwarded to both the image processor and the tokenizer.
     * @returns {Promise<Object>} The image processor output merged with the tokenizer
     * output; tokenizer keys win on a name clash.
     */
    async _call(/** @type {RawImage|RawImage[]} */ images, text = null, kwargs = {}) {
        const image_inputs = await this.image_processor(images, kwargs);

        // Without a prompt there is nothing to tokenize: return the image inputs only.
        if (!text) {
            return { ...image_inputs };
        }

        // The last two entries of `pixel_values.dims` are treated as (height, width).
        const [height, width] = image_inputs.pixel_values.dims.slice(-2);

        // Default the extra-token count to 0: an undefined value would make the sum NaN,
        // and `repeat(NaN)` yields an empty string, silently deleting the placeholder.
        const { image_token, patch_size, num_additional_image_tokens = 0 } = this.config;
        const num_image_tokens =
            Math.floor(height / patch_size) * Math.floor(width / patch_size)
            + num_additional_image_tokens;
        const expanded_token = image_token.repeat(num_image_tokens);

        // `map` builds a fresh array, so the caller's input is left untouched.
        const prompts = (Array.isArray(text) ? text : [text]).map(
            // `replaceAll` expands every placeholder (a prompt may contain several), and the
            // function replacer keeps `$` sequences in the token from being read as patterns.
            (prompt) => prompt.replaceAll(image_token, () => expanded_token),
        );

        const text_inputs = this.tokenizer(prompts, kwargs);

        return {
            ...image_inputs,
            ...text_inputs,
        };
    }
}
|