import { Idefics3Processor, Idefics3ForConditionalGeneration, RawImage } from "../../../src/transformers.js";

import { MAX_MODEL_LOAD_TIME, MAX_TEST_EXECUTION_TIME, MAX_MODEL_DISPOSE_TIME, DEFAULT_MODEL_OPTIONS } from "../../init.js";

export default () => {
  // Single-image chat prompt: one image placeholder followed by a question.
  const conversation = [
    {
      role: "user",
      content: [{ type: "image" }, { type: "text", text: "Can you describe this image?" }],
    },
  ];

  /**
   * Build a solid-color image of the given dimensions.
   * @param {number} width
   * @param {number} height
   * @param {number} channels
   * @param {number} value Byte value every pixel channel is filled with.
   * @returns {RawImage}
   */
  const solid_image = (width, height, channels, value) => new RawImage(new Uint8ClampedArray(width * height * channels).fill(value), width, height, channels);

  // Empty white and black images
  const white_image = solid_image(224, 224, 3, 255);
  const black_image = solid_image(720, 360, 3, 0);

  describe("Idefics3ForConditionalGeneration", () => {
    const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration";

    // To obtain unique output tokens, deterministically
    const generation_options = { max_new_tokens: 10, repetition_penalty: 2.0 };

    /**
     * Slice off the prompt tokens, leaving only the newly generated ones.
     * @param {import("../../../src/transformers.js").Tensor} generate_ids Full output ids (prompt + new tokens).
     * @param {Record<string, any>} inputs Processor output whose `input_ids` give the prompt length.
     */
    const new_tokens_of = (generate_ids, inputs) => generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);

    /** @type {Idefics3ForConditionalGeneration} */
    let model;
    /** @type {Idefics3Processor} */
    let processor;
    /** @type {string} */
    let text;

    beforeAll(async () => {
      model = await Idefics3ForConditionalGeneration.from_pretrained(model_id, DEFAULT_MODEL_OPTIONS);
      processor = await Idefics3Processor.from_pretrained(model_id);
      text = processor.apply_chat_template(conversation, { add_generation_prompt: true });
    }, MAX_MODEL_LOAD_TIME);

    it(
      "forward w/ image splitting (default)",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: true });

        const { logits } = await model(inputs);
        expect(logits.dims).toEqual([1, 3041, 128259]);
        expect(logits.mean().item()).toBeCloseTo(-0.0002692154666874558, 6);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "forward w/o image splitting",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: false });

        const { logits } = await model(inputs);
        expect(logits.dims).toEqual([1, 189, 128259]);
        expect(logits.mean().item()).toBeCloseTo(-0.00019743280427064747, 6);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 w/ image splitting",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: true });
        const generate_ids = await model.generate({ ...inputs, ...generation_options });
        expect(generate_ids.dims).toEqual([1, 3051]);

        const new_tokens = new_tokens_of(generate_ids, inputs);
        expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 102001n, 60344n]]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 w/o image splitting",
      async () => {
        const inputs = await processor(text, white_image, { do_image_splitting: false });
        const generate_ids = await model.generate({ ...inputs, ...generation_options });
        expect(generate_ids.dims).toEqual([1, 199]);

        const new_tokens = new_tokens_of(generate_ids, inputs);
        expect(new_tokens.tolist()).toEqual([[64531n, 121777n, 70370n, 105334n, 12720n, 113356n, 47739n, 59240n, 59697n, 65246n]]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1 multi-image w/o image splitting",
      async () => {
        // Two image placeholders followed by a question about both.
        const multi_image_conversation = [
          {
            role: "user",
            content: [{ type: "image" }, { type: "image" }, { type: "text", text: "Can you describe these images?" }],
          },
        ];

        const multi_image_text = processor.apply_chat_template(multi_image_conversation, { add_generation_prompt: true });
        const inputs = await processor(multi_image_text, [white_image, black_image], { do_image_splitting: false });
        const generate_ids = await model.generate({ ...inputs, ...generation_options });
        expect(generate_ids.dims).toEqual([1, 374]);

        const new_tokens = new_tokens_of(generate_ids, inputs);
        expect(new_tokens.tolist()).toEqual([[73189n, 99346n, 113252n, 51743n, 33499n, 66430n, 78739n, 89539n, 121023n, 14474n]]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    afterAll(async () => {
      await model?.dispose();
    }, MAX_MODEL_DISPOSE_TIME);
  });
};