import { Processor } from "../../base/processing_utils.js";
import { AutoImageProcessor } from "../auto/image_processing_auto.js";
import { AutoTokenizer } from "../../tokenizers.js";
import { RawImage } from "../../utils/image.js";
import { count } from "../../utils/core.js";

/**
 * Builds the prompt with expanded image tokens for when the image is split into patches.
 * @param {number} image_seq_len The number of image tokens per patch.
 * @param {number} image_rows The number of rows the image was split into.
 * @param {number} image_cols The number of columns the image was split into.
 * @param {string} fake_token_around_image The token delimiting each image sequence.
 * @param {string} image_token The image placeholder token.
 * @param {string} global_img_token The token marking the downscaled global image.
 * @returns {string} The expanded prompt string.
 * @private
 */
function _prompt_split_image(image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token) {
    let text_split_images = "";
    for (let n_h = 0; n_h < image_rows; ++n_h) {
        for (let n_w = 0; n_w < image_cols; ++n_w) {
            text_split_images += (
                fake_token_around_image +
                `<row_${n_h + 1}_col_${n_w + 1}>` +
                image_token.repeat(image_seq_len)
            );
        }
        text_split_images += "\n";
    }
    text_split_images += (
        `\n${fake_token_around_image}` +
        `${global_img_token}` +
        image_token.repeat(image_seq_len) +
        `${fake_token_around_image}`
    );
    return text_split_images;
}
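
// Illustrative trace (not part of the library; values assume `image_seq_len = 2`
// and a 1x2 patch grid):
//
//   _prompt_split_image(2, 1, 2, "<fake_token_around_image>", "<image>", "<global-img>")
//
// returns:
//
//   "<fake_token_around_image><row_1_col_1><image><image>" +
//   "<fake_token_around_image><row_1_col_2><image><image>\n" +
//   "\n<fake_token_around_image><global-img><image><image><fake_token_around_image>"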

/**
 * Builds the prompt with expanded image tokens for a single (unsplit) image.
 * @param {number} image_seq_len The number of image tokens to insert.
 * @param {string} fake_token_around_image The token delimiting the image sequence.
 * @param {string} image_token The image placeholder token.
 * @param {string} global_img_token The token marking the global image.
 * @returns {string} The expanded prompt string.
 * @private
 */
function _prompt_single_image(image_seq_len, fake_token_around_image, image_token, global_img_token) {
    return (
        `${fake_token_around_image}` +
        `${global_img_token}` +
        image_token.repeat(image_seq_len) +
        `${fake_token_around_image}`
    );
}
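
// Illustrative trace (not part of the library; assumes `image_seq_len = 3`):
//
//   _prompt_single_image(3, "<fake_token_around_image>", "<image>", "<global-img>")
//   // => "<fake_token_around_image><global-img><image><image><image><fake_token_around_image>"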

/**
 * Selects the single-image or split-image prompt. A grid of 0 rows and 0 columns
 * means the image was not split into patches.
 * @private
 */
function get_image_prompt_string(image_rows, image_cols, image_seq_len, fake_token_around_image, image_token, global_img_token) {
    if (image_rows === 0 && image_cols === 0) {
        return _prompt_single_image(
            image_seq_len,
            fake_token_around_image,
            image_token,
            global_img_token,
        );
    }
    return _prompt_split_image(
        image_seq_len, image_rows, image_cols, fake_token_around_image, image_token, global_img_token,
    );
}
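
// Illustrative dispatch (not part of the library; `FAKE`, `IMG`, and `GLOB`
// stand in for the special token strings defined on the processor below):
//
//   get_image_prompt_string(0, 0, 1, FAKE, IMG, GLOB) // single-image prompt
//   get_image_prompt_string(2, 2, 1, FAKE, IMG, GLOB) // 2x2 split-image prompt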

export class Idefics3Processor extends Processor {
    static image_processor_class = AutoImageProcessor;
    static tokenizer_class = AutoTokenizer;
    static uses_processor_config = true;

    // Special tokens used when expanding `<image>` placeholders in the prompt.
    fake_image_token = "<fake_token_around_image>";
    image_token = "<image>";
    global_img_token = "<global-img>";

    /**
     * Processes text and images, expanding each `<image>` placeholder in the text
     * into the full image-token sequence expected by the model.
     * @param {string|string[]} text The input text, containing one `<image>` token per image.
     * @param {RawImage|RawImage[]|RawImage[][]} images The images to process.
     * @param {Object} options Additional options, forwarded to the image processor.
     * @returns {Promise<any>} The combined tokenizer and image processor outputs.
     */
    async _call(text, images = null, options = {}) {
        options.return_row_col_info ??= true;

        let image_inputs;
        if (images) {
            image_inputs = await this.image_processor(images, options);
        }

        // NOTE: We assume text is present
        if (!Array.isArray(text)) {
            text = [text];
        }

        // Fall back to a 0x0 grid (i.e., unsplit images) when no images were
        // passed or the image processor returned no row/column information.
        // (Optional chaining is needed here: `image_inputs` is undefined for
        // text-only calls.)
        const image_rows = image_inputs?.rows ?? [new Array(text.length).fill(0)];
        const image_cols = image_inputs?.cols ?? [new Array(text.length).fill(0)];

        const image_seq_len = this.config.image_seq_len;

        const n_images_in_text = [];
        const prompt_strings = [];
        for (let i = 0; i < text.length; ++i) {
            const sample = text[i];
            const sample_rows = image_rows[i];
            const sample_cols = image_cols[i];

            // Count how many image tokens appear in this sample.
            n_images_in_text.push(count(sample, this.image_token));

            // Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
            const image_prompt_strings = sample_rows.map(
                (n_rows, j) => get_image_prompt_string(
                    n_rows,
                    sample_cols[j],
                    image_seq_len,
                    this.fake_image_token,
                    this.image_token,
                    this.global_img_token,
                )
            );

            const split_sample = sample.split(this.image_token);
            // `String.prototype.split` always returns at least one element, so
            // instead check that an image token is actually present whenever
            // expanded image prompts need to be spliced in.
            if (image_prompt_strings.length > 0 && split_sample.length === 1) {
                throw new Error("The image token should be present in the text.");
            }

            // Splice the expanded image prompt strings in where the image tokens were
            let new_sample = split_sample[0];
            for (let j = 0; j < image_prompt_strings.length; ++j) {
                new_sample += image_prompt_strings[j] + split_sample[j + 1];
            }
            prompt_strings.push(new_sample);
        }

        const text_inputs = this.tokenizer(prompt_strings);
        return {
            ...text_inputs,
            ...image_inputs,
        };
    }
}
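
// Usage sketch (illustrative, not part of this file). Assumes the processor is
// loaded through `AutoProcessor`; the checkpoint id is only an example:
//
//   import { AutoProcessor, RawImage } from "@huggingface/transformers";
//
//   const processor = await AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3");
//   const image = await RawImage.fromURL("https://example.com/photo.jpg");
//
//   // One `<image>` placeholder per image; it is expanded into the full
//   // image-token sequence before tokenization.
//   const inputs = await processor("<image>Describe this image.", image);
//   // `inputs` combines the tokenizer output (e.g. `input_ids`) with the image
//   // processor output (e.g. `pixel_values`).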