File size: 4,395 Bytes
ca97aa9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import { AutoImageProcessor, Idefics3ImageProcessor } from "../../../src/transformers.js";

import { load_cached_image } from "../../asset_cache.js";
import { MAX_PROCESSOR_LOAD_TIME, MAX_TEST_EXECUTION_TIME } from "../../init.js";

export default () => {
  // Idefics3ImageProcessor
  // - custom image processing (patching)
  describe("Idefics3ImageProcessor", () => {
    const model_id = "hf-internal-testing/tiny-random-Idefics3ForConditionalGeneration";

    /**
     * Fixture images, populated by `beforeAll`.
     * Keys: `image`, `image_1`, `image_2`, `white_image`, `white_image_1`, `white_image_2`.
     * @type {Record<string, import('../../../src/utils/image.js').RawImage>}
     */
    const images = {};
    /** @type {Idefics3ImageProcessor} */
    let processor;

    /**
     * Build processor options that always request row/column info.
     * A fresh object is returned on every call, so tests never share an options object.
     * @param {Record<string, any>} [overrides] Extra options, placed ahead of `return_row_col_info`.
     * @returns {Record<string, any>} The options object to pass to the processor.
     */
    const with_row_col_info = (overrides = {}) => ({ ...overrides, return_row_col_info: true });

    /**
     * Register a test case using the shared per-test timeout.
     * @param {string} title Test name.
     * @param {() => Promise<void>} body Async test body.
     */
    const add_test = (title, body) => it(title, body, MAX_TEST_EXECUTION_TIME);

    beforeAll(async () => {
      processor = await AutoImageProcessor.from_pretrained(model_id);

      // Load the two base images, then derive a 1600x1067 (`_1`) and a 224x224 (`_2`)
      // variant of each, stored under `<key>`, `<key>_1` and `<key>_2`.
      const base_images = {
        image: await load_cached_image("gradient_1280x640"),
        white_image: await load_cached_image("white_image"),
      };
      for (const [key, base] of Object.entries(base_images)) {
        images[key] = base;
        images[`${key}_1`] = await base.resize(1600, 1067);
        images[`${key}_2`] = await base.resize(224, 224);
      }
    }, MAX_PROCESSOR_LOAD_TIME);

    add_test("no image splitting", async () => {
      const output = await processor(images.image, with_row_col_info({ do_image_splitting: false }));

      expect(output.pixel_values.dims).toEqual([1, 1, 3, 364, 364]);
      expect(output.pixel_values.mean().item()).toBeCloseTo(-0.001035306602716446, 2);
      expect(output.rows).toEqual([[0]]);
      expect(output.cols).toEqual([[0]]);
    });

    add_test("batched no image splitting", async () => {
      // Three samples holding 1, 1 and 2 images: 4 of the 3x2 image slots are real images.
      // NOTE(review): presumably the remaining slots are zero padding, hence the 2/3 means — confirm.
      const batch = [[images.white_image_1], [images.white_image_2], [images.white_image_1, images.white_image_2]];
      const expected_mean = 2 / 3;
      const expected_grid = [[0], [0], [0, 0]];

      const output = await processor(batch, with_row_col_info({ do_image_splitting: false }));
      const mask = output.pixel_attention_mask;

      expect(output.pixel_values.dims).toEqual([3, 2, 3, 364, 364]);
      expect(output.pixel_values.mean().item()).toBeCloseTo(expected_mean, 2);
      expect(mask.dims).toEqual([3, 2, 364, 364]);
      expect(mask.to("float32").mean().item()).toBeCloseTo(expected_mean, 3);
      expect(output.rows).toEqual(expected_grid);
      expect(output.cols).toEqual(expected_grid);

      // Test that the order of the pixel attention mask matches the python implementation:
      // an index-weighted sum changes if mask entries are laid out in a different order.
      const index_weighted_sum = mask.data.reduce((total, value, index) => total + index * value, 0);
      expect(index_weighted_sum).toEqual(228217205216);
    });

    add_test("correct patching", async () => {
      // Expected mean of each of the 9 images along dim 1 (2 rows x 4 cols, plus one more).
      const expected_patch_means = [[-0.7012196183204651, -0.30104631185531616, 0.09912905097007751, 0.49929487705230713, -0.5011996626853943, -0.10103467106819153, 0.2991456389427185, 0.6993265151977539, -0.0010353063698858023]];

      const output = await processor(images.image, with_row_col_info());

      expect(output.pixel_values.dims).toEqual([1, 9, 3, 364, 364]);
      expect(output.pixel_values.flatten(2).mean(2).tolist()).toBeCloseToNested(expected_patch_means, 1);
      expect(output.rows).toEqual([[2]]);
      expect(output.cols).toEqual([[4]]);
    });

    // Shape-only checks. `input` is a thunk because `images` is only populated once
    // `beforeAll` has run, which happens after these cases are registered.
    const shape_cases = [
      {
        title: "unbatched, single image",
        input: () => images.image_1,
        dims: [1, 13, 3, 364, 364],
        rows: [[3]],
        cols: [[4]],
      },
      {
        title: "unbatched, multiple images",
        input: () => [images.image_1, images.image_2],
        dims: [1, 30, 3, 364, 364],
        rows: [[3, 4]],
        cols: [[4, 4]],
      },
      {
        title: "batched, multiple images",
        input: () => [[images.image_1], [images.image_1, images.image_2]],
        dims: [2, 30, 3, 364, 364],
        rows: [[3], [3, 4]],
        cols: [[4], [4, 4]],
      },
    ];

    for (const { title, input, dims, rows, cols } of shape_cases) {
      add_test(title, async () => {
        const output = await processor(input(), with_row_col_info());

        expect(output.pixel_values.dims).toEqual(dims);
        expect(output.rows).toEqual(rows);
        expect(output.cols).toEqual(cols);
      });
    }
  });
};