import { Idefics3Processor, Idefics3ForConditionalGeneration, RawImage } from "../../../src/transformers.js";
import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js";
export default () => {
  // Chat-template input: one image placeholder followed by a text prompt.
  const conversation = [
    {
      role: "user",
      content: [{ type: "image" }, { type: "text", text: "Can you describe this image?" }],
    },
  ];

  // Synthetic solid-color test images: all-white 224x224 and all-black 720x360 (RGB).
  const make_solid_image = (width, height, channels, value) => new RawImage(new Uint8ClampedArray(width * height * channels).fill(value), width, height, channels);
  const white_image = make_solid_image(224, 224, 3, 255);
  const black_image = make_solid_image(720, 360, 3, 0);

  describe("Idefics3ForConditionalGeneration", () => {
    const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration";

    /** @type {Idefics3ForConditionalGeneration} */
    let model;
    /** @type {Idefics3Processor} */
    let processor;
    /** @type {string} */
    let text;

    beforeAll(async () => {
      model = await Idefics3ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS);
      processor = await Idefics3Processor.from_pretrained(model_id);
      text = processor.apply_chat_template(conversation, {
        add_generation_prompt: true,
      });
    }, MAX_MODEL_LOAD_TIME);

    // Runs a single forward pass over the white image and checks logits shape and mean.
    const check_forward = async (do_image_splitting, expected_dims, expected_mean) => {
      const inputs = await processor(text, white_image, { do_image_splitting });
      const { logits } = await model(inputs);
      expect(logits.dims).toEqual(expected_dims);
      expect(logits.mean().item()).toBeCloseTo(expected_mean, 6);
    };

    // Generates from `inputs` and checks the output shape and the newly generated token ids.
    const check_generation = async (inputs, expected_dims, expected_new_tokens) => {
      const generate_ids = await model.generate({
        ...inputs,
        max_new_tokens: 10,
        // To obtain unique output tokens, deterministically
        repetition_penalty: 2.0,
      });
      expect(generate_ids.dims).toEqual(expected_dims);
      // Only compare the tokens appended after the prompt.
      const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
      expect(new_tokens.tolist()).toEqual([expected_new_tokens]);
    };

    it(
      "forward w/ image splitting (default)",
      () => check_forward(true, [1, 3041, 128259], -0.0002692154666874558),
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "forward w/o image splitting",
      () => check_forward(false, [1, 189, 128259], -0.00019743280427064747),
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 w/ image splitting",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: true });
        await check_generation(inputs, [1, 3051], [64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 102001n, 60344n]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 w/o image splitting",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: false });
        await check_generation(inputs, [1, 199], [64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 59697n, 65246n]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 multi-image w/o image splitting",
      async () => {
        // Two image placeholders, fed one white and one black image.
        const multi_image_conversation = [
          {
            role: "user",
            content: [{ type: "image" }, { type: "image" }, { type: "text", text: "Can you describe these images?" }],
          },
        ];
        const multi_image_text = processor.apply_chat_template(multi_image_conversation, {
          add_generation_prompt: true,
        });
        const inputs = await processor(multi_image_text, [white_image, black_image], { do_image_splitting: false });
        await check_generation(inputs, [1, 374], [73189n, 99346n, 113252n, 51743n, 33499n, 66430n, 78739n, 89539n, 121023n, 14474n]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    afterAll(async () => {
      await model?.dispose();
    }, MAX_MODEL_DISPOSE_TIME);
  });
};