tuandunghcmut commited on
Commit
0b87f0f
·
verified ·
1 Parent(s): 2422711

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because this commit contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. InternVL/clip_benchmark/AUTHORS.rst +6 -0
  2. InternVL/clip_benchmark/Makefile +91 -0
  3. InternVL/clip_benchmark/README.md +1293 -0
  4. InternVL/clip_benchmark/requirements.txt +16 -0
  5. InternVL/clip_benchmark/test_internvl_c_retrieval.sh +21 -0
  6. InternVL/clip_benchmark/test_internvl_c_xtd.sh +37 -0
  7. InternVL/clip_benchmark/test_internvl_g_classification.sh +90 -0
  8. InternVL/clip_benchmark/test_internvl_g_imagenet.sh +45 -0
  9. InternVL/clip_benchmark/tox.ini +19 -0
  10. InternVL/segmentation/configs/_base_/datasets/ade20k.py +56 -0
  11. InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of2.py +56 -0
  12. InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of4.py +56 -0
  13. InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of8.py +56 -0
  14. InternVL/segmentation/configs/_base_/datasets/ade20k_640x640.py +54 -0
  15. InternVL/segmentation/configs/_base_/datasets/ade20k_896x896.py +54 -0
  16. InternVL/segmentation/configs/_base_/datasets/chase_db1.py +59 -0
  17. InternVL/segmentation/configs/_base_/datasets/cityscapes.py +54 -0
  18. InternVL/segmentation/configs/_base_/datasets/cityscapes_768x768.py +35 -0
  19. InternVL/segmentation/configs/_base_/datasets/cityscapes_769x769.py +35 -0
  20. InternVL/segmentation/configs/_base_/datasets/cityscapes_832x832.py +35 -0
  21. InternVL/segmentation/configs/_base_/datasets/coco-stuff10k.py +57 -0
  22. InternVL/segmentation/configs/_base_/datasets/coco-stuff164k.py +54 -0
  23. InternVL/segmentation/configs/_base_/datasets/coco-stuff164k_896x896.py +54 -0
  24. InternVL/segmentation/configs/_base_/datasets/drive.py +59 -0
  25. InternVL/segmentation/configs/_base_/datasets/hrf.py +59 -0
  26. InternVL/segmentation/configs/_base_/datasets/isaid.py +62 -0
  27. InternVL/segmentation/configs/_base_/datasets/loveda.py +54 -0
  28. InternVL/segmentation/configs/_base_/datasets/pascal_context.py +60 -0
  29. InternVL/segmentation/configs/_base_/datasets/pascal_context_59.py +60 -0
  30. InternVL/segmentation/configs/_base_/datasets/pascal_voc12.py +57 -0
  31. InternVL/segmentation/configs/_base_/datasets/pascal_voc12_aug.py +9 -0
  32. InternVL/segmentation/configs/_base_/datasets/potsdam.py +54 -0
  33. InternVL/segmentation/configs/_base_/datasets/stare.py +59 -0
  34. InternVL/segmentation/configs/_base_/datasets/vaihingen.py +54 -0
  35. InternVL/segmentation/configs/_base_/default_runtime.py +15 -0
  36. InternVL/segmentation/configs/_base_/models/ann_r50-d8.py +46 -0
  37. InternVL/segmentation/configs/_base_/models/bisenetv2.py +80 -0
  38. InternVL/segmentation/configs/_base_/models/ccnet_r50-d8.py +44 -0
  39. InternVL/segmentation/configs/_base_/models/cgnet.py +35 -0
  40. InternVL/segmentation/configs/_base_/models/deeplabv3_r50-d8.py +44 -0
  41. InternVL/segmentation/configs/_base_/models/deeplabv3_unet_s5-d16.py +50 -0
  42. InternVL/segmentation/configs/_base_/models/dnl_r50-d8.py +46 -0
  43. InternVL/segmentation/configs/_base_/models/dpt_vit-b16.py +31 -0
  44. InternVL/segmentation/configs/_base_/models/emanet_r50-d8.py +47 -0
  45. InternVL/segmentation/configs/_base_/models/fast_scnn.py +57 -0
  46. InternVL/segmentation/configs/_base_/models/fcn_r50-d8.py +45 -0
  47. InternVL/segmentation/configs/_base_/models/fcn_unet_s5-d16.py +51 -0
  48. InternVL/segmentation/configs/_base_/models/gcnet_r50-d8.py +46 -0
  49. InternVL/segmentation/configs/_base_/models/icnet_r50-d8.py +74 -0
  50. InternVL/segmentation/configs/_base_/models/mask2former_beit.py +138 -0
InternVL/clip_benchmark/AUTHORS.rst ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ =======
2
+ Credits
3
+ =======
4
+
5
+ * `Mehdi Cherti <https://github.com/mehdidc>`_
6
+ * `Romain Beaumont <https://github.com/rom1504>`_
InternVL/clip_benchmark/Makefile ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Makefile for clip_benchmark — development, test, docs, and release tasks.
# Run `make help` to list all documented targets (the `## text` after each
# target line is what the help script prints).

# Every command target is declared phony: none of them corresponds to a real
# file, so a stray file named e.g. `test` or `release` can never mask them.
# (The original list omitted test, test-all, install-dev, servedocs, release.)
.PHONY: clean clean-build clean-pyc clean-test coverage dist docs servedocs \
        help install install-dev lint lint/flake8 release test test-all
.DEFAULT_GOAL := help

# Small Python helper: open a local file in the default web browser.
# Exported so recipes can run it via `python -c "$$BROWSER_PYSCRIPT"`.
define BROWSER_PYSCRIPT
import os, webbrowser, sys

from urllib.request import pathname2url

webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT

# Python helper: scan this Makefile on stdin and print every
# `target: ... ## description` pair as a two-column help listing.
define PRINT_HELP_PYSCRIPT
import re, sys

for line in sys.stdin:
	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
	if match:
		target, help = match.groups()
		print("%-20s %s" % (target, help))
endef
export PRINT_HELP_PYSCRIPT

BROWSER := python -c "$$BROWSER_PYSCRIPT"

help: ## show this help message
	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)

clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts

clean-build: ## remove build artifacts
	rm -fr build/
	rm -fr dist/
	rm -fr .eggs/
	find . -name '*.egg-info' -exec rm -fr {} +
	find . -name '*.egg' -exec rm -f {} +

clean-pyc: ## remove Python file artifacts
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +
	find . -name '__pycache__' -exec rm -fr {} +

clean-test: ## remove test and coverage artifacts
	rm -fr .tox/
	rm -f .coverage
	rm -fr htmlcov/
	rm -fr .pytest_cache

lint/flake8: ## check style with flake8
	flake8 clip_benchmark tests

lint: lint/flake8 ## check style

test-all: ## run tests on every Python version with tox
	tox

coverage: ## check code coverage quickly with the default Python
	coverage run --source clip_benchmark setup.py test
	coverage report -m
	coverage html
	$(BROWSER) htmlcov/index.html

docs: ## generate Sphinx HTML documentation, including API docs
	rm -f docs/clip_benchmark.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ clip_benchmark
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	$(BROWSER) docs/_build/html/index.html

servedocs: docs ## compile the docs watching for changes
	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .

release: dist ## package and upload a release
	twine upload dist/*

dist: clean ## builds source and wheel package
	python setup.py sdist
	python setup.py bdist_wheel
	ls -l dist

install: ## [Local development] Upgrade pip, install requirements, install package.
	python -m pip install -U pip
	python -m pip install -e .

install-dev: ## [Local development] Install test requirements
	python -m pip install -r requirements-test.txt

test: ## [Local development] Run unit tests
	python -m pytest -x -s -v tests
InternVL/clip_benchmark/README.md ADDED
@@ -0,0 +1,1293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # InternVL for Zero-Shot Image Classification & Image-Text Retrieval
2
+
3
+ This folder contains the implementation of InternVL 1.0 for zero-shot image classification and zero-shot image-text retrieval, which corresponds to Section 4.3 of our [InternVL 1.0 paper](https://arxiv.org/pdf/2312.14238).
4
+ We mainly use [CLIP Benchmark](https://github.com/LAION-AI/CLIP_benchmark) to evaluate the performance of InternVL. Thanks for this great work.
5
+
6
+ ## 🛠️ Installation
7
+
8
+ First, follow the [installation guide](../INSTALLATION.md) to perform some basic installations.
9
+
10
+ In addition, using this codebase requires executing the following steps:
11
+
12
+ - Install other requirements:
13
+
14
+ ```bash
15
+ pip install -r requirements.txt
16
+ ```
17
+
18
+ - Install `clip_benchmark` using development mode:
19
+
20
+ ```bash
21
+ python setup.py develop
22
+ # You can also add the current directory to PYTHONPATH instead.
23
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
24
+ ```
25
+
26
+ ## 📦 Data Preparation
27
+
28
+ This codebase will automatically download the required dataset. If the dataset fails to download automatically, please refer to this [code](./clip_benchmark/datasets/builder.py) for manual downloading.
29
+
30
+ ## 📦 Model Preparation
31
+
32
+ | model name | type | download | size |
33
+ | ------------------------ | :---------: | ------------------------------------------------------------------------------------------ | :-----: |
34
+ | internvl_c_13b_224px.pth | pytorch | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL/blob/main/internvl_c_13b_224px.pth) | 25.4 GB |
35
+ | InternVL-14B-224px | huggingface | 🤗 [HF link](https://huggingface.co/OpenGVLab/InternVL-14B-224px) | 27.7 GB |
36
+
37
+ Please download the above model weights and place them in the `pretrained/` folder.
38
+
39
+ You can download either the PyTorch version or the Hugging Face version based on your needs.
40
+
41
+ ```sh
42
+ cd pretrained/
43
+ wget https://huggingface.co/OpenGVLab/InternVL/resolve/main/internvl_c_13b_224px.pth
44
+ # pip install -U huggingface_hub
45
+ huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-14B-224px --local-dir InternVL-14B-224px
46
+ ```
47
+
48
+ The directory structure is:
49
+
50
+ ```sh
51
+ pretrained
52
+ ├── internvl_c_13b_224px.pth
53
+ └── InternVL-14B-224px/
54
+ ```
55
+
56
+ ## 📊 Evaluation: Zero-Shot Image Classification
57
+
58
+ ### ImageNet variants and ObjectNet
59
+
60
+ | model name | IN-1K | IN-A | IN-R | IN-V2 | IN-Sketch | ObjectNet | ∆ | average |
61
+ | :--------: | :---: | :--: | :--: | :---: | :-------: | :-------: | :-: | :-----: |
62
+ | InternVL-C | 83.2 | 83.8 | 95.5 | 77.3 | 73.9 | 80.6 | 0.8 | 82.4 |
63
+
64
+ <details>
65
+ <summary>[InternVL-C] ImageNet-1K val</summary>
66
+
67
+ ```bash
68
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
69
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
70
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
71
+ ```
72
+
73
+ Expected results:
74
+
75
+ ```
76
+ {"dataset": "imagenet1k", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
77
+ "metrics": {"acc1": 0.83178, "acc5": 0.97322, "mean_per_class_recall": 0.83204}, "language": "en"}
78
+ ```
79
+
80
+ </details>
81
+
82
+ <details>
83
+ <summary>[InternVL-C] ImageNet-A</summary>
84
+
85
+ ```bash
86
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
87
+ --task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
88
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
89
+ ```
90
+
91
+ Expected results:
92
+
93
+ ```
94
+ {"dataset": "imagenet-a", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
95
+ "metrics": {"acc1": 0.8377333333333333, "acc5": 0.9558666666666666, "mean_per_class_recall": 0.8183934468491632}, "language": "en"}
96
+ ```
97
+
98
+ </details>
99
+
100
+ <details>
101
+ <summary>[InternVL-C] ImageNet-R</summary>
102
+
103
+ ```bash
104
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
105
+ --task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
106
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
107
+ ```
108
+
109
+ Expected results:
110
+
111
+ ```
112
+ {"dataset": "imagenet-r", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
113
+ "metrics": {"acc1": 0.9549666666666666, "acc5": 0.9918333333333333, "mean_per_class_recall": 0.9460205918105684}, "language": "en"}
114
+ ```
115
+
116
+ </details>
117
+
118
+ <details>
119
+ <summary>[InternVL-C] ImageNet-V2</summary>
120
+
121
+ ```bash
122
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
123
+ --task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
124
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
125
+ ```
126
+
127
+ Expected results:
128
+
129
+ ```
130
+ {"dataset": "imagenetv2", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
131
+ "metrics": {"acc1": 0.7726, "acc5": 0.9468, "mean_per_class_recall": 0.7738000000000001}, "language": "en"}
132
+ ```
133
+
134
+ </details>
135
+
136
+ <details>
137
+ <summary>[InternVL-C] ImageNet-Sketch</summary>
138
+
139
+ ```bash
140
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
141
+ --task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
142
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
143
+ ```
144
+
145
+ Expected results:
146
+
147
+ ```
148
+ {"dataset": "imagenet_sketch", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
149
+ "metrics": {"acc1": 0.7385879070133035, "acc5": 0.9199827074613374, "mean_per_class_recall": 0.7386403921568627}, "language": "en"}
150
+ ```
151
+
152
+ </details>
153
+
154
+ <details>
155
+ <summary>[InternVL-C] ObjectNet</summary>
156
+
157
+ ```bash
158
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
159
+ --task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
160
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
161
+ ```
162
+
163
+ Expected results:
164
+
165
+ ```
166
+ {"dataset": "objectnet", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
167
+ "metrics": {"acc1": 0.8059114891784215, "acc5": 0.9387853989447615, "mean_per_class_recall": 0.797040815749882}, "language": "en"}
168
+ ```
169
+
170
+ </details>
171
+
172
+ ### Multilingual ImageNet-1K
173
+
174
+ | model name | IN-1K (EN) | IN-1K (ZH) | IN-1K (JP) | IN-1K (AR) | IN-1K (IT) | average |
175
+ | :--------: | :--------: | :--------: | :--------: | :--------: | :--------: | :-----: |
176
+ | InternVL-C | 83.2 | 64.5 | 61.5 | 44.9 | 65.7 | 64.0 |
177
+
178
+ <details>
179
+ <summary>[InternVL-C] ImageNet-1K val (ZH, Chinese)</summary>
180
+
181
+ ```bash
182
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
183
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
184
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
185
+ ```
186
+
187
+ Expected results:
188
+
189
+ ```
190
+ {"dataset": "imagenet1k", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
191
+ "metrics": {"acc1": 0.6446, "acc5": 0.87842, "mean_per_class_recall": 0.6444200000000001}, "language": "cn"}
192
+ ```
193
+
194
+ </details>
195
+
196
+ <details>
197
+ <summary>[InternVL-C] ImageNet-1K val (JP, Japanese)</summary>
198
+
199
+ ```bash
200
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
201
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
202
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
203
+ ```
204
+
205
+ Expected results:
206
+
207
+ ```
208
+ {"dataset": "imagenet1k", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
209
+ "metrics": {"acc1": 0.61488, "acc5": 0.81146, "mean_per_class_recall": 0.6140599999999999}, "language": "jp"}
210
+ ```
211
+
212
+ </details>
213
+
214
+ <details>
215
+ <summary>[InternVL-C] ImageNet-1K val (AR, Arabic)</summary>
216
+
217
+ ```bash
218
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
219
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
220
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
221
+ ```
222
+
223
+ Expected results:
224
+
225
+ ```
226
+ {"dataset": "imagenet1k", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
227
+ "metrics": {"acc1": 0.4486, "acc5": 0.66418, "mean_per_class_recall": 0.44764}, "language": "ar"}
228
+ ```
229
+
230
+ </details>
231
+
232
+ <details>
233
+ <summary>[InternVL-C] ImageNet-1K val (IT, Italian)</summary>
234
+
235
+ ```bash
236
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
237
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
238
+ --model internvl_c_classification --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
239
+ ```
240
+
241
+ Expected results:
242
+
243
+ ```
244
+ {"dataset": "imagenet1k", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
245
+ "metrics": {"acc1": 0.65686, "acc5": 0.85254, "mean_per_class_recall": 0.6557799999999999}, "language": "it"}
246
+ ```
247
+
248
+ </details>
249
+
250
+ ### Other Datasets
251
+
252
+ <img width="1219" alt="image" src="https://github.com/OpenGVLab/InternVL/assets/23737120/5de18a6c-8979-432d-bcb6-eb7796b4a08f">
253
+
254
+ <details>
255
+ <summary>[InternVL-C] CIFAR-10</summary>
256
+
257
+ ```bash
258
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
259
+ --dataset "cifar10" --dataset_root ./data/ --model internvl_c_classification \
260
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
261
+ ```
262
+
263
+ Expected results:
264
+
265
+ ```
266
+ {"dataset": "cifar10", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
267
+ "metrics": {"acc1": 0.9935, "acc5": 0.9996, "mean_per_class_recall": 0.9935}, "language": "en"}
268
+ ```
269
+
270
+ </details>
271
+
272
+ <details>
273
+ <summary>[InternVL-C] CIFAR-100</summary>
274
+
275
+ ```bash
276
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
277
+ --dataset "cifar100" --dataset_root ./data/ --model internvl_c_classification \
278
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
279
+ ```
280
+
281
+ Expected results:
282
+
283
+ ```
284
+ {"dataset": "cifar100", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
285
+ "metrics": {"acc1": 0.9315, "acc5": 0.9925, "mean_per_class_recall": 0.9314}, "language": "en"}
286
+ ```
287
+
288
+ </details>
289
+
290
+ <details>
291
+ <summary>[InternVL-C] MNIST</summary>
292
+
293
+ ```bash
294
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
295
+ --dataset "mnist" --dataset_root ./data/ --model internvl_c_classification \
296
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
297
+ ```
298
+
299
+ Expected results:
300
+
301
+ ```
302
+ {"dataset": "mnist", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
303
+ "metrics": {"acc1": 0.806, "acc5": 0.9743, "mean_per_class_recall": 0.8028667364603377}, "language": "en"}
304
+ ```
305
+
306
+ </details>
307
+
308
+ <details>
309
+ <summary>[InternVL-C] Caltech-101</summary>
310
+
311
+ ```bash
312
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
313
+ --dataset "caltech101" --dataset_root ./data/ --model internvl_c_classification \
314
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
315
+ ```
316
+
317
+ Expected results:
318
+
319
+ ```
320
+ {"dataset": "caltech101", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
321
+ "metrics": {"acc1": 0.8949037620297463, "acc5": 0.9847987751531059, "mean_per_class_recall": 0.9548738053818752}, "language": "en"}
322
+ ```
323
+
324
+ </details>
325
+
326
+ <details>
327
+ <summary>[InternVL-C] SUN397</summary>
328
+
329
+ ```bash
330
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
331
+ --dataset "sun397" --dataset_root ./data/ --model internvl_c_classification \
332
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
333
+ ```
334
+
335
+ Expected results:
336
+
337
+ ```
338
+ {"dataset": "sun397", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
339
+ "metrics": {"acc1": 0.7600180223256157, "acc5": 0.9623370174890119, "mean_per_class_recall": 0.7641970904214413}, "language": "en"}
340
+ ```
341
+
342
+ </details>
343
+
344
+ <details>
345
+ <summary>[InternVL-C] FGVC Aircraft</summary>
346
+
347
+ ```bash
348
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
349
+ --dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_c_classification \
350
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
351
+ ```
352
+
353
+ Expected results:
354
+
355
+ ```
356
+ {"dataset": "fgvc_aircraft", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
357
+ "metrics": {"acc1": 0.5271527152715272, "acc5": 0.9426942694269427, "mean_per_class_recall": 0.5255169340463458}, "language": "en"}
358
+ ```
359
+
360
+ </details>
361
+
362
+ <details>
363
+ <summary>[InternVL-C] Country-211</summary>
364
+
365
+ ```bash
366
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
367
+ --dataset "country211" --dataset_root ./data/ --model internvl_c_classification \
368
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
369
+ ```
370
+
371
+ Expected results:
372
+
373
+ ```
374
+ {"dataset": "country211", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
375
+ "metrics": {"acc1": 0.34080568720379145, "acc5": 0.6048815165876777, "mean_per_class_recall": 0.3406635071090047}, "language": "en"}
376
+ ```
377
+
378
+ </details>
379
+
380
+ <details>
381
+ <summary>[InternVL-C] Stanford Cars</summary>
382
+
383
+ ```bash
384
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
385
+ --dataset "cars" --dataset_root ./data/ --model internvl_c_classification \
386
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
387
+ ```
388
+
389
+ Expected results:
390
+
391
+ ```
392
+ {"dataset": "cars", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
393
+ "metrics": {"acc1": 0.9416739211540853, "acc5": 0.99950254943415, "mean_per_class_recall": 0.9416684924576828}, "language": "en"}
394
+ ```
395
+
396
+ </details>
397
+
398
+ <details>
399
+ <summary>[InternVL-C] Birdsnap</summary>
400
+
401
+ ```bash
402
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
403
+ --dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_c_classification \
404
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
405
+ ```
406
+
407
+ Expected results:
408
+
409
+ ```
410
+ {"dataset": "birdsnap", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
411
+ "metrics": {"acc1": 0.7203252032520325, "acc5": 0.9636856368563685, "mean_per_class_recall": 0.7027551020408164}, "language": "en"}
412
+ ```
413
+
414
+ </details>
415
+
416
+ <details>
417
+ <summary>[InternVL-C] DTD</summary>
418
+
419
+ ```bash
420
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
421
+ --dataset "dtd" --dataset_root ./data/ --model internvl_c_classification \
422
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
423
+ ```
424
+
425
+ Expected results:
426
+
427
+ ```
428
+ {"dataset": "dtd", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
429
+ "metrics": {"acc1": 0.7074468085106383, "acc5": 0.9367021276595745, "mean_per_class_recall": 0.7079787234042553}, "language": "en"}
430
+ ```
431
+
432
+ </details>
433
+
434
+ <details>
435
+ <summary>[InternVL-C] Eurosat</summary>
436
+
437
+ ```bash
438
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
439
+ --dataset "eurosat" --dataset_root ./data/ --model internvl_c_classification \
440
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
441
+ ```
442
+
443
+ Expected results:
444
+
445
+ ```
446
+ {"dataset": "eurosat", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
447
+ "metrics": {"acc1": 0.7937407407407407, "acc5": 0.9984074074074074, "mean_per_class_recall": 0.8013766666666665}, "language": "en"}
448
+ ```
449
+
450
+ </details>
451
+
452
+ <details>
453
+ <summary>[InternVL-C] FER2013</summary>
454
+
455
+ ```bash
456
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
457
+ --dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_c_classification \
458
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
459
+ ```
460
+
461
+ Expected results:
462
+
463
+ ```
464
+ {"dataset": "fer2013", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
465
+ "metrics": {"acc1": 0.561994984675397, "acc5": 0.9732516021175815, "mean_per_class_recall": 0.5305440899910082}, "language": "en"}
466
+ ```
467
+
468
+ </details>
469
+
470
+ <details>
471
+ <summary>[InternVL-C] Flowers-102</summary>
472
+
473
+ ```bash
474
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
475
+ --dataset "vtab/flowers" --dataset_root ./data/ --model internvl_c_classification \
476
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
477
+ ```
478
+
479
+ Expected results:
480
+
481
+ ```
482
+ {"dataset": "vtab/flowers", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
483
+ "metrics": {"acc1": 0.8606277443486746, "acc5": 0.953651000162628, "mean_per_class_recall": 0.8563173902114554}, "language": "en"}
484
+ ```
485
+
486
+ </details>
487
+
488
+ <details>
489
+ <summary>[InternVL-C] Food-101</summary>
490
+
491
+ ```bash
492
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
493
+ --dataset "food101" --dataset_root ./data/ --model internvl_c_classification \
494
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
495
+ ```
496
+
497
+ Expected results:
498
+
499
+ ```
500
+ {"dataset": "food101", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
501
+ "metrics": {"acc1": 0.9526336633663366, "acc5": 0.9954851485148515, "mean_per_class_recall": 0.9527524752475246}, "language": "en"}
502
+ ```
503
+
504
+ </details>
505
+
506
+ <details>
507
+ <summary>[InternVL-C] GTSRB</summary>
508
+
509
+ ```bash
510
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
511
+ --dataset "gtsrb" --dataset_root ./data/ --model internvl_c_classification \
512
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
513
+ ```
514
+
515
+ Expected results:
516
+
517
+ ```
518
+ {"dataset": "gtsrb", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
519
+ "metrics": {"acc1": 0.6548693586698338, "acc5": 0.9089469517022961, "mean_per_class_recall": 0.5775180283147926}, "language": "en"}
520
+ ```
521
+
522
+ </details>
523
+
524
+ <details>
525
+ <summary>[InternVL-C] Pets</summary>
526
+
527
+ ```bash
528
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
529
+ --dataset "pets" --dataset_root ./data/ --model internvl_c_classification \
530
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
531
+ ```
532
+
533
+ Expected results:
534
+
535
+ ```
536
+ {"dataset": "pets", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
537
+ "metrics": {"acc1": 0.9604796947397111, "acc5": 0.9991823385118561, "mean_per_class_recall": 0.9602545246926443}, "language": "en"}
538
+ ```
539
+
540
+ </details>
541
+
542
+ <details>
543
+ <summary>[InternVL-C] Rendered SST2</summary>
544
+
545
+ ```bash
546
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
547
+ --dataset "renderedsst2" --dataset_root ./data/ --model internvl_c_classification \
548
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
549
+ ```
550
+
551
+ Expected results:
552
+
553
+ ```
554
+ {"dataset": "renderedsst2", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
555
+ "metrics": {"acc1": 0.6792970895112576, "acc5": NaN, "mean_per_class_recall": 0.6792944097041282}, "language": "en"}
556
+ ```
557
+
558
+ </details>
559
+
560
+ <details>
561
+ <summary>[InternVL-C] Resisc45</summary>
562
+
563
+ ```bash
564
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
565
+ --dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_c_classification \
566
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
567
+ ```
568
+
569
+ Expected results:
570
+
571
+ ```
572
+ {"dataset": "vtab/resisc45", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
573
+ "metrics": {"acc1": 0.7422631328360577, "acc5": 0.9663545468973179, "mean_per_class_recall": 0.7481098478511045}, "language": "en"}
574
+ ```
575
+
576
+ </details>
577
+
578
+ <details>
579
+ <summary>[InternVL-C] STL10</summary>
580
+
581
+ ```bash
582
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
583
+ --dataset "stl10" --dataset_root ./data/ --model internvl_c_classification \
584
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
585
+ ```
586
+
587
+ Expected results:
588
+
589
+ ```
590
+ {"dataset": "stl10", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
591
+ "metrics": {"acc1": 0.9945, "acc5": 1.0, "mean_per_class_recall": 0.9945}, "language": "en"}
592
+ ```
593
+
594
+ </details>
595
+
596
+ <details>
597
+ <summary>[InternVL-C] VOC2007</summary>
598
+
599
+ ```bash
600
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
601
+ --dataset "voc2007" --dataset_root ./data/ --model internvl_c_classification \
602
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
603
+ ```
604
+
605
+ Expected results:
606
+
607
+ ```
608
+ {"dataset": "voc2007", "model": "internvl_c_classification", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_classification",
609
+ "metrics": {"acc1": 0.7997462606837606, "acc5": 0.9795005341880342, "mean_per_class_recall": 0.9048832641726575}, "language": "en"}
610
+ ```
611
+
612
+ </details>
613
+
614
+ ## 📊 Evaluation: Zero-Shot Image-Text Retrieval
615
+
616
+ ### Flickr30K & COCO
617
+
618
+ <table>
619
+ <tr align=center>
620
+ <td rowspan="3" align=center><b>model</b></td>
621
+ <td colspan="6" align=center><b>Flickr30K</b></td>
622
+ <td colspan="6" align=center><b>COCO</b></td>
623
+ <td rowspan="3" align=center><b>avg</b></td>
624
+
625
+ </tr>
626
+ <tr align=center>
627
+ <td colspan="3" align=center><b>image-to-text</b></td>
628
+ <td colspan="3" align=center><b>text-to-image</b></td>
629
+ <td colspan="3" align=center><b>image-to-text</b></td>
630
+ <td colspan="3" align=center><b>text-to-image</b></td>
631
+ </tr>
632
+ <tr>
633
+ <td>R@1</td>
634
+ <td>R@5</td>
635
+ <td>R@10</td>
636
+ <td>R@1</td>
637
+ <td>R@5</td>
638
+ <td>R@10</td>
639
+ <td>R@1</td>
640
+ <td>R@5</td>
641
+ <td>R@10</td>
642
+ <td>R@1</td>
643
+ <td>R@5</td>
644
+ <td>R@10</td>
645
+ </tr>
646
+
647
+ <tr align=center>
648
+ <td>InternVL-C</td>
649
+ <td>94.7</td>
650
+ <td>99.6</td>
651
+ <td>99.9</td>
652
+ <td>81.7</td>
653
+ <td>96.0</td>
654
+ <td>98.2</td>
655
+ <td>70.6</td>
656
+ <td>89.0</td>
657
+ <td>93.5</td>
658
+ <td>54.1</td>
659
+ <td>77.3</td>
660
+ <td>84.6</td>
661
+ <td>86.6</td>
662
+ </tr>
663
+ <tr align=center>
664
+ <td>InternVL-G</td>
665
+ <td>95.7</td>
666
+ <td>99.7</td>
667
+ <td>99.9</td>
668
+ <td>85.0</td>
669
+ <td>97.0</td>
670
+ <td>98.6</td>
671
+ <td>74.9</td>
672
+ <td>91.3</td>
673
+ <td>95.2</td>
674
+ <td>58.6</td>
675
+ <td>81.3</td>
676
+ <td>88.0</td>
677
+ <td>88.8</td>
678
+ </tr>
679
+
680
+ </table>
681
+
682
+ <details>
683
+ <summary>[InternVL-C] Flickr30K</summary>
684
+
685
+ ```bash
686
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
687
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
688
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
689
+ ```
690
+
691
+ Expected results:
692
+
693
+ ```
694
+ {"dataset": "flickr30k", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval",
695
+ "metrics": {"image_retrieval_recall@1": 0.8166000247001648, "text_retrieval_recall@1": 0.9470000267028809,
696
+ "image_retrieval_recall@5": 0.9603999853134155, "text_retrieval_recall@5": 0.9959999918937683,
697
+ "image_retrieval_recall@10": 0.9819999933242798, "text_retrieval_recall@10": 0.9990000128746033}, "language": "en"}
698
+ ```
699
+
700
+ </details>
701
+
702
+ <details>
703
+ <summary>[InternVL-C] COCO</summary>
704
+
705
+ ```bash
706
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
707
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
708
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
709
+ ```
710
+
711
+ Expected results:
712
+
713
+ ```
714
+ {"dataset": "mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval",
715
+ "metrics": {"image_retrieval_recall@1": 0.5411835312843323, "text_retrieval_recall@1": 0.7059999704360962,
716
+ "image_retrieval_recall@5": 0.7731707096099854, "text_retrieval_recall@5": 0.8902000188827515,
717
+ "image_retrieval_recall@10": 0.8463414907455444, "text_retrieval_recall@10": 0.9354000091552734}, "language": "en"}
718
+ ```
719
+
720
+ </details>
721
+
722
+ <details>
723
+ <summary>[InternVL-G] Flickr30K</summary>
724
+
725
+ ```bash
726
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
727
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
728
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json
729
+ ```
730
+
731
+ Expected results:
732
+
733
+ ```
734
+ {"dataset": "flickr30k", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval",
735
+ "metrics": {"image_retrieval_recall@1": 0.8497999906539917, "text_retrieval_recall@1": 0.9570000171661377,
736
+ "image_retrieval_recall@5": 0.9700000286102295, "text_retrieval_recall@5": 0.996999979019165,
737
+ "image_retrieval_recall@10": 0.98580002784729, "text_retrieval_recall@10": 0.9990000128746033}, "language": "en"}
738
+ ```
739
+
740
+ </details>
741
+
742
+ <details>
743
+ <summary>[InternVL-G] COCO</summary>
744
+
745
+ ```bash
746
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
747
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
748
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json
749
+ ```
750
+
751
+ Expected results:
752
+
753
+ ```
754
+ {"dataset": "mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval",
755
+ "metrics": {"image_retrieval_recall@1": 0.5858056545257568, "text_retrieval_recall@1": 0.7491999864578247,
756
+ "image_retrieval_recall@5": 0.813194751739502, "text_retrieval_recall@5": 0.9129999876022339,
757
+ "image_retrieval_recall@10": 0.8795281648635864, "text_retrieval_recall@10": 0.9521999955177307}, "language": "en"}
758
+ ```
759
+
760
+ </details>
761
+
762
+ ### Flickr30K-CN & COCO-CN
763
+
764
+ <table>
765
+ <tr align=center>
766
+ <td rowspan="3" align=center><b>model</b></td>
767
+ <td colspan="6" align=center><b>Flickr30K-CN</b></td>
768
+ <td colspan="6" align=center><b>COCO-CN</b></td>
769
+ <td rowspan="3" align=center><b>avg</b></td>
770
+
771
+ </tr>
772
+ <tr align=center>
773
+ <td colspan="3" align=center><b>image-to-text</b></td>
774
+ <td colspan="3" align=center><b>text-to-image</b></td>
775
+ <td colspan="3" align=center><b>image-to-text</b></td>
776
+ <td colspan="3" align=center><b>text-to-image</b></td>
777
+ </tr>
778
+ <tr>
779
+ <td>R@1</td>
780
+ <td>R@5</td>
781
+ <td>R@10</td>
782
+ <td>R@1</td>
783
+ <td>R@5</td>
784
+ <td>R@10</td>
785
+ <td>R@1</td>
786
+ <td>R@5</td>
787
+ <td>R@10</td>
788
+ <td>R@1</td>
789
+ <td>R@5</td>
790
+ <td>R@10</td>
791
+ </tr>
792
+
793
+ <tr align=center>
794
+ <td>InternVL-C</td>
795
+ <td>90.3</td>
796
+ <td>98.8</td>
797
+ <td>99.7</td>
798
+ <td>75.1</td>
799
+ <td>92.9</td>
800
+ <td>96.4</td>
801
+ <td>68.8</td>
802
+ <td>92.0</td>
803
+ <td>96.7</td>
804
+ <td>68.9</td>
805
+ <td>91.9</td>
806
+ <td>96.5</td>
807
+ <td>89.0</td>
808
+ </tr>
809
+ <tr align=center>
810
+ <td>InternVL-G</td>
811
+ <td>92.9</td>
812
+ <td>99.4</td>
813
+ <td>99.8</td>
814
+ <td>77.7</td>
815
+ <td>94.8</td>
816
+ <td>97.3</td>
817
+ <td>71.4</td>
818
+ <td>93.9</td>
819
+ <td>97.7</td>
820
+ <td>73.8</td>
821
+ <td>94.4</td>
822
+ <td>98.1</td>
823
+ <td>90.9</td>
824
+ </tr>
825
+
826
+ </table>
827
+
828
+ <details>
829
+ <summary>[InternVL-C] Flickr30K-CN</summary>
830
+
831
+ ```bash
832
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
833
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
834
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
835
+ ```
836
+
837
+ Expected results:
838
+
839
+ ```
840
+ {"dataset": "flickr30k", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval",
841
+ "metrics": {"image_retrieval_recall@1": 0.7509999871253967, "text_retrieval_recall@1": 0.902999997138977,
842
+ "image_retrieval_recall@5": 0.9290000200271606, "text_retrieval_recall@5": 0.9879999756813049,
843
+ "image_retrieval_recall@10": 0.9638000130653381, "text_retrieval_recall@10": 0.996999979019165}, "language": "cn"}
844
+ ```
845
+
846
+ </details>
847
+
848
+ <details>
849
+ <summary>[InternVL-C] COCO-CN</summary>
850
+
851
+ ```bash
852
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
853
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
854
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
855
+ ```
856
+
857
+ Expected results:
858
+
859
+ ```
860
+ {"dataset": "mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval",
861
+ "metrics": {"image_retrieval_recall@1": 0.6885090470314026, "text_retrieval_recall@1": 0.6880000233650208,
862
+ "image_retrieval_recall@5": 0.9192782640457153, "text_retrieval_recall@5": 0.9200000166893005,
863
+ "image_retrieval_recall@10": 0.9648622870445251, "text_retrieval_recall@10": 0.9670000076293945}, "language": "cn"}
864
+ ```
865
+
866
+ </details>
867
+
868
+ <details>
869
+ <summary>[InternVL-G] Flickr30K-CN</summary>
870
+
871
+ ```bash
872
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
873
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
874
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json
875
+ ```
876
+
877
+ Expected results:
878
+
879
+ ```
880
+ {"dataset": "flickr30k", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval",
881
+ "metrics": {"image_retrieval_recall@1": 0.7767999768257141, "text_retrieval_recall@1": 0.9290000200271606,
882
+ "image_retrieval_recall@5": 0.9476000070571899, "text_retrieval_recall@5": 0.9940000176429749,
883
+ "image_retrieval_recall@10": 0.9728000164031982, "text_retrieval_recall@10": 0.9980000257492065}, "language": "cn"}
884
+
885
+ ```
886
+
887
+ </details>
888
+
889
+ <details>
890
+ <summary>[InternVL-G] COCO-CN</summary>
891
+
892
+ ```bash
893
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
894
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
895
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json
896
+ ```
897
+
898
+ Expected results:
899
+
900
+ ```
901
+ {"dataset": "mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval",
902
+ "metrics": {"image_retrieval_recall@1": 0.7378917336463928, "text_retrieval_recall@1": 0.7139999866485596,
903
+ "image_retrieval_recall@5": 0.9439696073532104, "text_retrieval_recall@5": 0.9390000104904175,
904
+ "image_retrieval_recall@10": 0.9810066223144531, "text_retrieval_recall@10": 0.9769999980926514}, "language": "cn"}
905
+ ```
906
+
907
+ </details>
908
+
909
+ ### XTD
910
+
911
+ | model name | EN | ES | FR | ZH | IT | KO | RU | JP | average |
912
+ | :--------: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :--: | :-----: |
913
+ | InternVL-C | 97.3 | 95.7 | 95.1 | 95.6 | 96.0 | 92.2 | 93.3 | 95.5 | 95.1 |
914
+ | InternVL-G | 98.6 | 97.7 | 96.5 | 96.7 | 96.9 | 95.1 | 94.8 | 96.1 | 96.6 |
915
+
916
+ <details>
917
+ <summary>[InternVL-C] XTD</summary>
918
+
919
+ ```bash
920
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
921
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
922
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=en
923
+
924
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
925
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
926
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=es
927
+
928
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
929
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
930
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=fr
931
+
932
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
933
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
934
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=zh
935
+
936
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
937
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
938
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=it
939
+
940
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
941
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
942
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ko
943
+
944
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
945
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
946
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ru
947
+
948
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
949
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
950
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=jp
951
+ ```
952
+
953
+ Expected results:
954
+
955
+ ```
956
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.7670000195503235, "text_retrieval_recall@1": 0.7480000257492065, "image_retrieval_recall@5": 0.9200000166893005, "text_retrieval_recall@5": 0.921999990940094, "image_retrieval_recall@10": 0.9670000076293945, "text_retrieval_recall@10": 0.9729999899864197}, "language": "en"}
957
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.7059999704360962, "text_retrieval_recall@1": 0.7009999752044678, "image_retrieval_recall@5": 0.9020000100135803, "text_retrieval_recall@5": 0.8960000276565552, "image_retrieval_recall@10": 0.9430000185966492, "text_retrieval_recall@10": 0.9570000171661377}, "language": "es"}
958
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6970000267028809, "text_retrieval_recall@1": 0.6899999976158142, "image_retrieval_recall@5": 0.8830000162124634, "text_retrieval_recall@5": 0.8889999985694885, "image_retrieval_recall@10": 0.9350000023841858, "text_retrieval_recall@10": 0.9509999752044678}, "language": "fr"}
959
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6480000019073486, "text_retrieval_recall@1": 0.6710000038146973, "image_retrieval_recall@5": 0.8759999871253967, "text_retrieval_recall@5": 0.8769999742507935, "image_retrieval_recall@10": 0.9419999718666077, "text_retrieval_recall@10": 0.9559999704360962}, "language": "zh"}
960
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6790000200271606, "text_retrieval_recall@1": 0.7039999961853027, "image_retrieval_recall@5": 0.8989999890327454, "text_retrieval_recall@5": 0.8999999761581421, "image_retrieval_recall@10": 0.9440000057220459, "text_retrieval_recall@10": 0.9599999785423279}, "language": "it"}
961
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.5830000042915344, "text_retrieval_recall@1": 0.5920000076293945, "image_retrieval_recall@5": 0.8399999737739563, "text_retrieval_recall@5": 0.8360000252723694, "image_retrieval_recall@10": 0.9079999923706055, "text_retrieval_recall@10": 0.921999990940094}, "language": "ko"}
962
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6430000066757202, "text_retrieval_recall@1": 0.6439999938011169, "image_retrieval_recall@5": 0.8510000109672546, "text_retrieval_recall@5": 0.8640000224113464, "image_retrieval_recall@10": 0.9169999957084656, "text_retrieval_recall@10": 0.9330000281333923}, "language": "ru"}
963
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_c_retrieval", "pretrained": "./pretrained/internvl_c_13b_224px.pth", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6330000162124634, "text_retrieval_recall@1": 0.6759999990463257, "image_retrieval_recall@5": 0.875, "text_retrieval_recall@5": 0.8989999890327454, "image_retrieval_recall@10": 0.9359999895095825, "text_retrieval_recall@10": 0.9549999833106995}, "language": "jp"}
964
+ ```
965
+
966
+ </details>
967
+
968
+ <details>
969
+ <summary>[InternVL-G] XTD</summary>
970
+
971
+ ```bash
972
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
973
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
974
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=en
975
+
976
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
977
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
978
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=es
979
+
980
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
981
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
982
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=fr
983
+
984
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
985
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
986
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=zh
987
+
988
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
989
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
990
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=it
991
+
992
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
993
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
994
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=ko
995
+
996
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
997
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
998
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=ru
999
+
1000
+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --task "zeroshot_retrieval" \
1001
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_g_retrieval_hf \
1002
+ --pretrained ./pretrained/InternVL-14B-224px --output result_g.json --language=jp
1003
+ ```
1004
+
1005
+ Expected results:
1006
+
1007
+ ```
1008
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.8119999766349792, "text_retrieval_recall@1": 0.7979999780654907, "image_retrieval_recall@5": 0.9470000267028809, "text_retrieval_recall@5": 0.9480000138282776, "image_retrieval_recall@10": 0.9829999804496765, "text_retrieval_recall@10": 0.9860000014305115}, "language": "en"}
1009
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.7549999952316284, "text_retrieval_recall@1": 0.7450000047683716, "image_retrieval_recall@5": 0.9350000023841858, "text_retrieval_recall@5": 0.925000011920929, "image_retrieval_recall@10": 0.9660000205039978, "text_retrieval_recall@10": 0.9769999980926514}, "language": "es"}
1010
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.7450000047683716, "text_retrieval_recall@1": 0.7279999852180481, "image_retrieval_recall@5": 0.9179999828338623, "text_retrieval_recall@5": 0.9190000295639038, "image_retrieval_recall@10": 0.9620000123977661, "text_retrieval_recall@10": 0.9649999737739563}, "language": "fr"}
1011
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6980000138282776, "text_retrieval_recall@1": 0.6949999928474426, "image_retrieval_recall@5": 0.9120000004768372, "text_retrieval_recall@5": 0.9110000133514404, "image_retrieval_recall@10": 0.9620000123977661, "text_retrieval_recall@10": 0.9670000076293945}, "language": "zh"}
1012
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.7329999804496765, "text_retrieval_recall@1": 0.7450000047683716, "image_retrieval_recall@5": 0.9309999942779541, "text_retrieval_recall@5": 0.9309999942779541, "image_retrieval_recall@10": 0.9639999866485596, "text_retrieval_recall@10": 0.968999981880188}, "language": "it"}
1013
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6430000066757202, "text_retrieval_recall@1": 0.6470000147819519, "image_retrieval_recall@5": 0.8790000081062317, "text_retrieval_recall@5": 0.8769999742507935, "image_retrieval_recall@10": 0.9419999718666077, "text_retrieval_recall@10": 0.9509999752044678}, "language": "ko"}
1014
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6850000023841858, "text_retrieval_recall@1": 0.6899999976158142, "image_retrieval_recall@5": 0.8740000128746033, "text_retrieval_recall@5": 0.8920000195503235, "image_retrieval_recall@10": 0.9390000104904175, "text_retrieval_recall@10": 0.9480000138282776}, "language": "ru"}
1015
+ {"dataset": "multilingual_mscoco_captions", "model": "internvl_g_retrieval_hf", "pretrained": "./pretrained/InternVL-14B-224px", "task": "zeroshot_retrieval", "metrics": {"image_retrieval_recall@1": 0.6850000023841858, "text_retrieval_recall@1": 0.703000009059906, "image_retrieval_recall@5": 0.9020000100135803, "text_retrieval_recall@5": 0.9100000262260437, "image_retrieval_recall@10": 0.9539999961853027, "text_retrieval_recall@10": 0.9610000252723694}, "language": "jp"}
1016
+ ```
1017
+
1018
+ </details>
1019
+
1020
+ ## Original README of CLIP Benchmark
1021
+
1022
+ [![pypi](https://img.shields.io/pypi/v/clip_benchmark.svg)](https://pypi.python.org/pypi/clip_benchmark)
1023
+
1024
+ The goal of this repo is to evaluate CLIP-like models on a standard set
1025
+ of datasets on different tasks such as zero-shot classification and zero-shot
1026
+ retrieval.
1027
+
1028
+ Below we show the average rank (1 is the best, lower is better) of different CLIP models, evaluated
1029
+ on different datasets.
1030
+
1031
+ ![benchmark.png](benchmark.png)
1032
+
1033
+ The current detailed results of the benchmark can be seen [here](benchmark/README.md)
1034
+ or directly in the [notebook](benchmark/results.ipynb).
1035
+
1036
+ ### Features
1037
+
1038
+ - Support for zero-shot classification and zero-shot retrieval
1039
+ - Support for [OpenCLIP](https://github.com/mlfoundations/open_clip) pre-trained models
1040
+ - Support various datasets from [torchvision](https://pytorch.org/vision/stable/datasets.html), [tensorflow datasets](https://www.tensorflow.org/datasets), and [VTAB](https://github.com/google-research/task_adaptation).
1041
+ - Support [Japanese CLIP by rinna](https://github.com/rinnakk/japanese-clip)
1042
+
1043
+ ### How to install?
1044
+
1045
+ `pip install clip-benchmark`
1046
+
1047
+ ### How to use?
1048
+
1049
+ To evaluate, we recommend creating a `models.txt` file like
1050
+
1051
+ ```
1052
+ ViT-B-32,openai
1053
+ ```
1054
+
1055
+ to get the list of datasets
1056
+
1057
+ ```
1058
+ wget https://raw.githubusercontent.com/LAION-AI/CLIP_benchmark/main/benchmark/webdatasets.txt
1059
+ ```
1060
+
1061
+ Then to run
1062
+
1063
+ ```
1064
+ clip_benchmark eval --pretrained_model models.txt \
1065
+ --dataset "webdatasets.txt" \
1066
+ --dataset_root "https://huggingface.co/datasets/clip-benchmark/wds_{dataset_cleaned}/tree/main" \
1067
+ --output "benchmark_{dataset}_{pretrained}_{model}_{language}_{task}.json"
1068
+ ```
1069
+
1070
+ Then to get the full table
1071
+
1072
+ ```
1073
+ clip_benchmark build benchmark_*.json --output benchmark.csv
1074
+ ```
1075
+
1076
+ #### Command line interface (CLI)
1077
+
1078
+ The easiest way to benchmark the models is using the CLI, `clip_benchmark`.
1079
+ You can specify the model to use, the dataset and the task to evaluate on. Once it is done, evaluation is performed and
1080
+ the results are written into a JSON file.
1081
+
1082
+ #### Using other models than openclip
1083
+
1084
+ It is possible to use models other than the openclip ones. For example, japanese-clip is supported
1085
+
1086
+ Here is an example of use
1087
+
1088
+ ```
1089
+ >>> python3 clip_benchmark/cli.py eval \
1090
+ --model_type "ja_clip" \ # flag to use japanese-clip
1091
+ --pretrained "rinna/japanese-cloob-vit-b-16" \ # now, we have `rinna/japanese-cloob-vit-b-16` or `rinna/japanese-clip-vit-b-16`.
1092
+ --language "jp" \
1093
+ --task "zeroshot_classification" \
1094
+ --dataset "imagenet1k" \
1095
+ --dataset_root {ROOT_PATH}
1096
+
1097
+ >>> cat result.json
1098
+ {"dataset": "imagenet1k", "model": "ViT-B-32-quickgelu", "pretrained": "rinna/japanese-cloob-vit-b-16", "task": "zeroshot_classification", "metrics": {"acc1": 0.54636, "acc5": 0.72856, "mean_per_class_recall": 0.54522}, "language": "jp"}
1099
+ ```
1100
+
1101
+ #### How to add other CLIP models
1102
+
1103
+ Please follow these steps:
1104
+
1105
+ 1. Add an identity file to load the model in `clip_benchmark/models`
1106
+ 2. Define a loading function, that returns a tuple (model, transform, tokenizer). Please see `clip_benchmark/models/open_clip.py` as an example.
1107
+ 3. Add the function into `TYPE2FUNC` in `clip_benchmark/models/__init__.py`
1108
+
1109
+ Remarks:
1110
+
1111
+ - The new tokenizer/model must enable to do the following things as https://github.com/openai/CLIP#usage
1112
+ - `tokenizer(texts).to(device)` ... `texts` is a list of string
1113
+ - `model.encode_text(tokenized_texts)` ... `tokenized_texts` is an output from `tokenizer(texts).to(device)`
1114
+ - `model.encode_image(images)` ... `images` is a image tensor by the `transform`
1115
+
1116
+ #### CIFAR-10 example
1117
+
1118
+ Here is an example for CIFAR-10 zero-shot classification using OpenCLIP's pre-trained model on LAION-400m:
1119
+
1120
+ `clip_benchmark eval --dataset=cifar10 --task=zeroshot_classification --pretrained=laion400m_e32 --model=ViT-B-32-quickgelu --output=result.json --batch_size=64`
1121
+
1122
+ By default, the dataset is downloaded into `--dataset_root`, which by default is `root`.
1123
+
1124
+ Here is the content of `result.json` after the evaluation is done:
1125
+
1126
+ ```json
1127
+ {
1128
+ "dataset": "cifar10", "model": "ViT-B-32-quickgelu",
1129
+ "pretrained": "laion400m_e32", "task": "zeroshot_classification",
1130
+ "metrics": {"acc1": 0.9074, "acc5": 0.998}
1131
+ }
1132
+ ```
1133
+
1134
+ #### VOC2007 example
1135
+
1136
+ Here is another example with VOC2007, which is a multi-label classification dataset.
1137
+
1138
+ `clip_benchmark eval --dataset=voc2007_multilabel --task=zeroshot_classification --pretrained=laion400m_e32 --model=ViT-B-32-quickgelu --output=result.json --batch_size=64`
1139
+
1140
+ Here is the content of `result.json` after the evaluation is done:
1141
+
1142
+ ```json
1143
+ {"dataset": "voc2007_multilabel", "model": "ViT-B-32-quickgelu", "pretrained": "laion400m_e32", "task": "zeroshot_classification", "metrics": {"mean_average_precision": 0.7627869844436646}}
1144
+ ```
1145
+
1146
+ Here, we compute the mean average precision or mAP, more details about that metric [here](https://fangdahan.medium.com/calculate-mean-average-precision-map-for-multi-label-classification-b082679d31be) in the context of multi-label classification.
1147
+
1148
+ #### VTAB example
1149
+
1150
+ Here is an example on how to run it on [VTAB](https://github.com/google-research/task_adaptation) classification tasks.
1151
+ First, you need to install VTAB's dedicated package.
1152
+
1153
+ `pip install task_adaptation==0.1`
1154
+
1155
+ Then, you can run it by providing the full dataset name.
1156
+ Example with `eurosat`:
1157
+
1158
+ `clip_benchmark eval --dataset=vtab/eurosat --task=zeroshot_classification --pretrained=laion400m_e32 --model=ViT-B-32-quickgelu --output=result.json --batch_size=64`
1159
+
1160
+ See [clip_benchmark/datasets/builder.py#L634](clip_benchmark/datasets/builder.py#L634) for the full list of
1161
+ VTAB dataset collection.
1162
+
1163
+ #### TensorFlow dataset example
1164
+
1165
+ Here is an example on how to run it on [Tensorflow datasets](https://www.tensorflow.org/datasets).
1166
+ First, you need to install `tfds-nightly` and `timm`.
1167
+
1168
+ `pip install timm tfds-nightly`
1169
+
1170
+ The name of the dataset follows the template `tfds/<DATASET_NAME>`.
1171
+
1172
+ Example with `cifar10`:
1173
+
1174
+ `clip_benchmark eval --dataset=tfds/cifar10 --task=zeroshot_classification --pretrained=laion400m_e32 --model=ViT-B-32-quickgelu --output=result.json --batch_size=64`
1175
+
1176
+ #### COCO captions example
1177
+
1178
+ Here is an example for COCO captions zero-shot retrieval:
1179
+
1180
+ `clip_benchmark eval --dataset=mscoco_captions --task=zeroshot_retrieval --pretrained=laion400m_e32 --model=ViT-B-32-quickgelu --output=result.json --batch_size=64`
1181
+
1182
+ Note that for using COCO, you also need to install `pycocotools` (e.g., using `pip install pycocotools`).
1183
+
1184
+ #### Webdataset example
1185
+
1186
+ Here is an example on how to run it on [webdatasets](https://github.com/webdataset/webdataset).
1187
+ First, you need to install `webdataset`.
1188
+
1189
+ `pip install webdataset`
1190
+
1191
+ ##### Creating a webdataset
1192
+
1193
+ You can either convert an already supported CLIP_benchmark dataset to webdataset format, or manually create your own with the same file structure. For already supported datasets use the CLI command `clip_benchmark_export_wds` as in this example:
1194
+
1195
+ ```
1196
+ $ clip_benchmark_export_wds --dataset cifar10 --split train --dataset_root DATA_DIR/ --output wds_cifar10/
1197
+ $ clip_benchmark_export_wds --dataset cifar10 --split test --dataset_root DATA_DIR/ --output wds_cifar10/
1198
+ ```
1199
+
1200
+ which will convert the train and test splits for CIFAR-10 (downloaded to `DATA_DIR/`) and save the webdataset to `wds_cifar10/` (upload to Huggingface Hub must be done manually for now). Retrieval datasets are also supported with the `--retrieval` flag.
1201
+
1202
+ For other datasets, data must be stored with the following file structure:
1203
+
1204
+ ```
1205
+ root_dir/
1206
+ train/
1207
+ nshards.txt
1208
+ 0.tar
1209
+ 1.tar
1210
+ ...
1211
+ test/
1212
+ nshards.txt
1213
+ 0.tar
1214
+ ...
1215
+ classnames.txt
1216
+ zeroshot_classification_templates.txt
1217
+ dataset_type.txt
1218
+ ```
1219
+
1220
+ Each split should be contained in its own folder and `nshards.txt` should contain a single integer corresponding to the number of TAR files. The TAR files should follow webdataset format, with an image file (.webp, .png, or .jpg) and a label (.cls) for each example. Classnames and templates are required for zeroshot classification evaluation, with each classname or template on its own line. Dataset type is required for distinguishing zeroshot retrieval evaluation: the file should just contain the text `retrieval`.
1221
+
1222
+ ##### Evaluating on a webdataset
1223
+
1224
+ The name of the dataset follows the template `wds/<DATASET_NAME>`. Note that the dataset name currently only affects the name in the results output - classnames and templates are loaded directly from the included files. The dataset root directory can be either a local path to the `root_dir` as specified above, or an HTTP URL pointing to a Huggingface Hub dataset file tree.
1225
+
1226
+ Example with `vtab/cifar10`:
1227
+
1228
+ ```
1229
+ $ clip_benchmark eval --dataset wds/vtab/cifar10 --dataset_root ROOT_DIR/wds_vtab-cifar10/
1230
+ $ clip_benchmark eval --dataset wds/vtab/cifar10 --dataset_root https://huggingface.co/datasets/clip-benchmark/wds_vtab-cifar10/tree/main
1231
+ ```
1232
+
1233
+ All other arguments remain the same as in the other examples. See `https://huggingface.co/clip-benchmark` for a full list of datasets that have already been uploaded to Huggingface.
1234
+
1235
+ ### Evaluate multiple models on multiple datasets
1236
+
1237
+ For the purpose of benchmarking, it is possible to run the CLI with multiple
1238
+ pre-trained models on multiple datasets.
1239
+
1240
+ #### Pretrained models and datasets list as arguments
1241
+
1242
+ For models, we can provide list of pretrained model names in the form of 'model,pretrained' (so `model` and `pretrained` are comma separated). For datasets, we can provide a list of datasets. For languages, we can provide a list of languages.
1243
+ Example:
1244
+
1245
+ ```bash
1246
+ clip_benchmark eval --pretrained_model ViT-B-32-quickgelu,laion400m_e32 ViT-L-14,laion400m_e32 \
1247
+ --dataset cifar10 cifar100 --dataset_root "clip_benchmark_datasets/{dataset}" --language en jp \
1248
+ --output "{dataset}_{pretrained}_{model}_{language}_{task}.json"
1249
+ ```
1250
+
1251
+ Note that `--dataset_root` and `--output` can be now in the form of a template that depends on the dataset/model/language/task (for `--output`) and dataset name (for `--dataset_root`).
1252
+
1253
+ Note that if the benchmark fails at some point, it is possible to resume it by skipping already evaluated models using `--skip_existing`.
1254
+
1255
+ #### Pretrained models and datasets list as files
1256
+
1257
+ We can also provide a path to files with models (each line is in the form of 'model,pretrained' where `model` and `pretrained` are comma separated) and datasets list (one dataset per line):
1258
+
1259
+ ```bash
1260
+ clip_benchmark eval --pretrained_model benchmark/models.txt \
1261
+ --dataset benchmark/datasets.txt --dataset_root "clip_benchmark_datasets/{dataset}" \
1262
+ --output "{dataset}_{pretrained}_{model}_{language}_{task}.json"
1263
+ ```
1264
+
1265
+ Examples are available in [benchmark/datasets.txt](benchmark/datasets.txt) and [benchmark/models.txt](benchmark/models.txt)
1266
+
1267
+ #### Model and dataset collections
1268
+
1269
+ We can also provide model collection names (`openai`, `openclip_base`, `openclip_multilingual`, `openclip_full` are supported) or dataset collection names (`vtab`, `vtab+`, `retrieval`, `imagenet_robustness` are supported):
1270
+
1271
+ ```bash
1272
+ clip_benchmark eval --pretrained_model openai openclip_base --dataset vtab+ retrieval \
1273
+ --dataset_root "clip_benchmark_datasets/{dataset}" --not quiet \
1274
+ --output "{dataset}_{pretrained}_{model}_{language}_{task}.json"
1275
+ ```
1276
+
1277
+ #### Development
1278
+
1279
+ For development, you can also do this:
1280
+
1281
+ ```bash
1282
+ git clone https://github.com/LAION-AI/CLIP_benchmark
1283
+ cd CLIP_benchmark
1284
+ python setup.py install
1285
+ ```
1286
+
1287
+ ### Credits
1288
+
1289
+ - Thanks to [OpenCLIP](https://github.com/mlfoundations/open_clip) authors, zero-shot accuracy code is adapted from there and pre-trained models are used in the command line interface.
1290
+ - Thanks to [SLIP](https://github.com/facebookresearch/SLIP) authors, some zero-shot templates and classnames are from there.
1291
+ - Thanks to [Wise-ft](https://github.com/mlfoundations/wise-ft) authors, Imagenet robustness datasets code is adapted from there.
1292
+ - Thanks to [LiT](https://arxiv.org/abs/2111.07991.pdf) authors, some zero-shot templates and classnames of VTAB datasets are from there.
1293
+ - This package was created with [Cookiecutter](https://github.com/audreyr/cookiecutter) and the [audreyr/cookiecutter-pypackage](https://github.com/audreyr/cookiecutter-pypackage) project template. Thanks to the author.
InternVL/clip_benchmark/requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ open_clip_torch>=0.2.1
2
+ opencv-python
3
+ peft>=0.6.2
4
+ protobuf==3.20.3
5
+ pycocoevalcap
6
+ pyyaml
7
+ scikit-learn>=1.0,<2
8
+ scikit-learn
9
+ scipy
10
+ task_adaptation
11
+ tensorflow==2.11.0
12
+ termcolor
13
+ tqdm>=2
14
+ transformers>=4.32.0
15
+ webdataset>=0.2.31
16
+ yacs
InternVL/clip_benchmark/test_internvl_c_retrieval.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -x
2
+
3
+ PARTITION=${PARTITION:-'INTERN4'}
4
+ alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
5
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
6
+
7
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
8
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
9
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
10
+
11
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
12
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
13
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
14
+
15
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
16
+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval \
17
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
18
+
19
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
20
+ --dataset "mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
21
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json
InternVL/clip_benchmark/test_internvl_c_xtd.sh ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -x
2
+
3
+ PARTITION=${PARTITION:-'INTERN4'}
4
+ alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
5
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
6
+
7
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
8
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
9
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=en
10
+
11
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
12
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
13
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=es
14
+
15
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
16
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
17
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=fr
18
+
19
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
20
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
21
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=zh
22
+
23
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
24
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
25
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=it
26
+
27
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
28
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
29
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ko
30
+
31
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
32
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
33
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=ru
34
+
35
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
36
+ --dataset "multilingual_mscoco_captions" --dataset_root ./data/mscoco_captions --model internvl_c_retrieval \
37
+ --pretrained ./pretrained/internvl_c_13b_224px.pth --output result.json --language=jp
InternVL/clip_benchmark/test_internvl_g_classification.sh ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -x
2
+
3
+ PARTITION=${PARTITION:-'INTERN4'}
4
+ alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
5
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
6
+
7
+
8
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
9
+ --dataset "birdsnap" --dataset_root ./data/birdsnap/ --model internvl_g_classification_hf \
10
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
11
+
12
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
13
+ --dataset "cifar10" --dataset_root ./data/ --model internvl_g_classification_hf \
14
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
15
+
16
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
17
+ --dataset "cifar100" --dataset_root ./data/ --model internvl_g_classification_hf \
18
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
19
+
20
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
21
+ --dataset "food101" --dataset_root ./data/ --model internvl_g_classification_hf \
22
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
23
+
24
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
25
+ --dataset "sun397" --dataset_root ./data/ --model internvl_g_classification_hf \
26
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
27
+
28
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
29
+ --dataset "cars" --dataset_root ./data/ --model internvl_g_classification_hf \
30
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
31
+
32
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
33
+ --dataset "fgvc_aircraft" --dataset_root ./data/ --model internvl_g_classification_hf \
34
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
35
+
36
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
37
+ --dataset "dtd" --dataset_root ./data/ --model internvl_g_classification_hf \
38
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
39
+
40
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
41
+ --dataset "pets" --dataset_root ./data/ --model internvl_g_classification_hf \
42
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
43
+
44
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
45
+ --dataset "caltech101" --dataset_root ./data/ --model internvl_g_classification_hf \
46
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
47
+
48
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
49
+ --dataset "mnist" --dataset_root ./data/ --model internvl_g_classification_hf \
50
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
51
+
52
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
53
+ --dataset "stl10" --dataset_root ./data/ --model internvl_g_classification_hf \
54
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
55
+
56
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
57
+ --dataset "eurosat" --dataset_root ./data/ --model internvl_g_classification_hf \
58
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
59
+
60
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
61
+ --dataset "gtsrb" --dataset_root ./data/ --model internvl_g_classification_hf \
62
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
63
+
64
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
65
+ --dataset "country211" --dataset_root ./data/ --model internvl_g_classification_hf \
66
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
67
+
68
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
69
+ --dataset "pcam" --dataset_root ./data/ --model internvl_g_classification_hf \
70
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
71
+
72
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
73
+ --dataset "renderedsst2" --dataset_root ./data/ --model internvl_g_classification_hf \
74
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
75
+
76
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
77
+ --dataset "fer2013" --dataset_root ./data/fer2013 --model internvl_g_classification_hf \
78
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
79
+
80
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
81
+ --dataset "voc2007" --dataset_root ./data/ --model internvl_g_classification_hf \
82
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
83
+
84
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
85
+ --dataset "vtab/flowers" --dataset_root ./data/ --model internvl_g_classification_hf \
86
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
87
+
88
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_classification" \
89
+ --dataset "vtab/resisc45" --dataset_root ./data/ --model internvl_g_classification_hf \
90
+ --pretrained ./pretrained/internvl_14b_224px --output result_g.json
InternVL/clip_benchmark/test_internvl_g_imagenet.sh ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ set -x
2
+
3
+ PARTITION=${PARTITION:-'INTERN4'}
4
+ alias s1a="srun -p ${PARTITION} -N 1 --gres=gpu:1 --cpus-per-task 10 --quotatype=auto"
5
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
6
+
7
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
8
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
9
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
10
+
11
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" \
12
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
13
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
14
+
15
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "it" \
16
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
17
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
18
+
19
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "jp" \
20
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
21
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
22
+
23
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "ar" \
24
+ --task "zeroshot_classification" --dataset "imagenet1k" --dataset_root ./data/imagenet-1k/ \
25
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
26
+
27
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
28
+ --task "zeroshot_classification" --dataset "imagenetv2" --dataset_root ./data/imagenetv2/ \
29
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
30
+
31
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
32
+ --task "zeroshot_classification" --dataset "imagenet_sketch" --dataset_root ./data/imagenet-sketch/ \
33
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
34
+
35
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
36
+ --task "zeroshot_classification" --dataset "imagenet-a" --dataset_root ./data/imagenet-a/ \
37
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
38
+
39
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
40
+ --task "zeroshot_classification" --dataset "imagenet-r" --dataset_root ./data/imagenet-r/ \
41
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
42
+
43
+ s1a --async python3 clip_benchmark/cli.py eval --model_type internvl --language "en" \
44
+ --task "zeroshot_classification" --dataset "objectnet" --dataset_root ./data/objectnet-1.0/ \
45
+ --model internvl_g_classification_hf --pretrained ./pretrained/internvl_14b_224px --output result_g.json
InternVL/clip_benchmark/tox.ini ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tox]
2
+ envlist = py36, py37, py38, flake8
3
+
4
+ [travis]
5
+ python =
6
+ 3.8: py38
7
+ 3.7: py37
8
+ 3.6: py36
9
+
10
+ [testenv:flake8]
11
+ basepython = python
12
+ deps = flake8
13
+ commands = flake8 clip_benchmark tests
14
+
15
+ [testenv]
16
+ setenv =
17
+ PYTHONPATH = {toxinidir}
18
+
19
+ commands = python setup.py test
InternVL/segmentation/configs/_base_/datasets/ade20k.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='SETR_Resize', keep_ratio=True,
28
+ crop_size=crop_size, setr_multi_scale=True),
29
+ # dict(type='ResizeToMultiple', size_divisor=32),
30
+ dict(type='RandomFlip'),
31
+ dict(type='Normalize', **img_norm_cfg),
32
+ dict(type='ImageToTensor', keys=['img']),
33
+ dict(type='Collect', keys=['img']),
34
+ ])
35
+ ]
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type=dataset_type,
41
+ data_root=data_root,
42
+ img_dir='images/training',
43
+ ann_dir='annotations/training',
44
+ pipeline=train_pipeline),
45
+ val=dict(
46
+ type=dataset_type,
47
+ data_root=data_root,
48
+ img_dir='images/validation',
49
+ ann_dir='annotations/validation',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='images/validation',
55
+ ann_dir='annotations/validation',
56
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of2.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (504, 504)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2016, 504), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2016, 504),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='ResizeToMultiple', size_divisor=14),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img']),
33
+ ])
34
+ ]
35
+ data = dict(
36
+ samples_per_gpu=4,
37
+ workers_per_gpu=4,
38
+ train=dict(
39
+ type=dataset_type,
40
+ data_root=data_root,
41
+ img_dir='images/training',
42
+ ann_dir='annotations/training',
43
+ max_image_num=20210 // 2,
44
+ pipeline=train_pipeline),
45
+ val=dict(
46
+ type=dataset_type,
47
+ data_root=data_root,
48
+ img_dir='images/validation',
49
+ ann_dir='annotations/validation',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='images/validation',
55
+ ann_dir='annotations/validation',
56
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of4.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (504, 504)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2016, 504), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2016, 504),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='ResizeToMultiple', size_divisor=14),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img']),
33
+ ])
34
+ ]
35
+ data = dict(
36
+ samples_per_gpu=4,
37
+ workers_per_gpu=4,
38
+ train=dict(
39
+ type=dataset_type,
40
+ data_root=data_root,
41
+ img_dir='images/training',
42
+ ann_dir='annotations/training',
43
+ max_image_num=20210 // 4,
44
+ pipeline=train_pipeline),
45
+ val=dict(
46
+ type=dataset_type,
47
+ data_root=data_root,
48
+ img_dir='images/validation',
49
+ ann_dir='annotations/validation',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='images/validation',
55
+ ann_dir='annotations/validation',
56
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of8.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (504, 504)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2016, 504), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2016, 504),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='ResizeToMultiple', size_divisor=14),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img']),
33
+ ])
34
+ ]
35
+ data = dict(
36
+ samples_per_gpu=4,
37
+ workers_per_gpu=4,
38
+ train=dict(
39
+ type=dataset_type,
40
+ data_root=data_root,
41
+ img_dir='images/training',
42
+ ann_dir='annotations/training',
43
+ max_image_num=20210 // 8,
44
+ pipeline=train_pipeline),
45
+ val=dict(
46
+ type=dataset_type,
47
+ data_root=data_root,
48
+ img_dir='images/validation',
49
+ ann_dir='annotations/validation',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='images/validation',
55
+ ann_dir='annotations/validation',
56
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/ade20k_640x640.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (640, 640)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2560, 640),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='images/training',
41
+ ann_dir='annotations/training',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='images/validation',
47
+ ann_dir='annotations/validation',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='images/validation',
53
+ ann_dir='annotations/validation',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/ade20k_896x896.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ADE20KDataset'
3
+ data_root = 'data/ade/ADEChallengeData2016'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (896, 896)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(896*4, 896), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(896*4, 896),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='images/training',
41
+ ann_dir='annotations/training',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='images/validation',
47
+ ann_dir='annotations/validation',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='images/validation',
53
+ ann_dir='annotations/validation',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/chase_db1.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ChaseDB1Dataset'
3
+ data_root = 'data/CHASE_DB1'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (960, 999)
7
+ crop_size = (128, 128)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/cityscapes.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'CityscapesDataset'
3
+ data_root = 'data/cityscapes/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 1024)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 1024),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=2,
36
+ workers_per_gpu=2,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='leftImg8bit/train',
41
+ ann_dir='gtFine/train',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='leftImg8bit/val',
47
+ ann_dir='gtFine/val',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='leftImg8bit/val',
53
+ ann_dir='gtFine/val',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/cityscapes_768x768.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cityscapes.py'
2
+ img_norm_cfg = dict(
3
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
4
+ crop_size = (768, 768)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations'),
8
+ dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
9
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
10
+ dict(type='RandomFlip', prob=0.5),
11
+ dict(type='PhotoMetricDistortion'),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
14
+ dict(type='DefaultFormatBundle'),
15
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
16
+ ]
17
+ test_pipeline = [
18
+ dict(type='LoadImageFromFile'),
19
+ dict(
20
+ type='MultiScaleFlipAug',
21
+ img_scale=(2049, 1025),
22
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
23
+ flip=False,
24
+ transforms=[
25
+ dict(type='Resize', keep_ratio=True),
26
+ dict(type='RandomFlip'),
27
+ dict(type='Normalize', **img_norm_cfg),
28
+ dict(type='ImageToTensor', keys=['img']),
29
+ dict(type='Collect', keys=['img']),
30
+ ])
31
+ ]
32
+ data = dict(
33
+ train=dict(pipeline=train_pipeline),
34
+ val=dict(pipeline=test_pipeline),
35
+ test=dict(pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/cityscapes_769x769.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cityscapes.py'
2
+ img_norm_cfg = dict(
3
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
4
+ crop_size = (769, 769)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations'),
8
+ dict(type='Resize', img_scale=(2049, 1025), ratio_range=(0.5, 2.0)),
9
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
10
+ dict(type='RandomFlip', prob=0.5),
11
+ dict(type='PhotoMetricDistortion'),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
14
+ dict(type='DefaultFormatBundle'),
15
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
16
+ ]
17
+ test_pipeline = [
18
+ dict(type='LoadImageFromFile'),
19
+ dict(
20
+ type='MultiScaleFlipAug',
21
+ img_scale=(2049, 1025),
22
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
23
+ flip=False,
24
+ transforms=[
25
+ dict(type='Resize', keep_ratio=True),
26
+ dict(type='RandomFlip'),
27
+ dict(type='Normalize', **img_norm_cfg),
28
+ dict(type='ImageToTensor', keys=['img']),
29
+ dict(type='Collect', keys=['img']),
30
+ ])
31
+ ]
32
+ data = dict(
33
+ train=dict(pipeline=train_pipeline),
34
+ val=dict(pipeline=test_pipeline),
35
+ test=dict(pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/cityscapes_832x832.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './cityscapes.py'
2
+ img_norm_cfg = dict(
3
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
4
+ crop_size = (832, 832)
5
+ train_pipeline = [
6
+ dict(type='LoadImageFromFile'),
7
+ dict(type='LoadAnnotations'),
8
+ dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
9
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
10
+ dict(type='RandomFlip', prob=0.5),
11
+ dict(type='PhotoMetricDistortion'),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
14
+ dict(type='DefaultFormatBundle'),
15
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
16
+ ]
17
+ test_pipeline = [
18
+ dict(type='LoadImageFromFile'),
19
+ dict(
20
+ type='MultiScaleFlipAug',
21
+ img_scale=(2048, 1024),
22
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
23
+ flip=False,
24
+ transforms=[
25
+ dict(type='Resize', keep_ratio=True),
26
+ dict(type='RandomFlip'),
27
+ dict(type='Normalize', **img_norm_cfg),
28
+ dict(type='ImageToTensor', keys=['img']),
29
+ dict(type='Collect', keys=['img']),
30
+ ])
31
+ ]
32
+ data = dict(
33
+ train=dict(pipeline=train_pipeline),
34
+ val=dict(pipeline=test_pipeline),
35
+ test=dict(pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/coco-stuff10k.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'COCOStuffDataset'
3
+ data_root = 'data/coco_stuff10k'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ reduce_zero_label=True,
41
+ img_dir='images/train2014',
42
+ ann_dir='annotations/train2014',
43
+ pipeline=train_pipeline),
44
+ val=dict(
45
+ type=dataset_type,
46
+ data_root=data_root,
47
+ reduce_zero_label=True,
48
+ img_dir='images/test2014',
49
+ ann_dir='annotations/test2014',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ reduce_zero_label=True,
55
+ img_dir='images/test2014',
56
+ ann_dir='annotations/test2014',
57
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/coco-stuff164k.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'COCOStuffDataset'
3
+ data_root = 'data/coco_stuff164k'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='images/train2017',
41
+ ann_dir='annotations/train2017',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='images/val2017',
47
+ ann_dir='annotations/val2017',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='images/val2017',
53
+ ann_dir='annotations/val2017',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/coco-stuff164k_896x896.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'COCOStuffDataset'
3
+ data_root = 'data/coco_stuff164k'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (896, 896)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(896 * 4, 896), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(896 * 4, 896),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='images/train2017',
41
+ ann_dir='annotations/train2017',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='images/val2017',
47
+ ann_dir='annotations/val2017',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='images/val2017',
53
+ ann_dir='annotations/val2017',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/drive.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'DRIVEDataset'
3
+ data_root = 'data/DRIVE'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (584, 565)
7
+ crop_size = (64, 64)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/hrf.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'HRFDataset'
3
+ data_root = 'data/HRF'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (2336, 3504)
7
+ crop_size = (256, 256)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/isaid.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'iSAIDDataset'
3
+ data_root = 'data/iSAID'
4
+
5
+ img_norm_cfg = dict(
6
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
7
+ """
8
+ This crop_size setting is followed by the implementation of
9
+ `PointFlow: Flowing Semantics Through Points for Aerial Image
10
+ Segmentation <https://arxiv.org/pdf/2103.06564.pdf>`_.
11
+ """
12
+
13
+ crop_size = (896, 896)
14
+
15
+ train_pipeline = [
16
+ dict(type='LoadImageFromFile'),
17
+ dict(type='LoadAnnotations'),
18
+ dict(type='Resize', img_scale=(896, 896), ratio_range=(0.5, 2.0)),
19
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
20
+ dict(type='RandomFlip', prob=0.5),
21
+ dict(type='PhotoMetricDistortion'),
22
+ dict(type='Normalize', **img_norm_cfg),
23
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
24
+ dict(type='DefaultFormatBundle'),
25
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
26
+ ]
27
+ test_pipeline = [
28
+ dict(type='LoadImageFromFile'),
29
+ dict(
30
+ type='MultiScaleFlipAug',
31
+ img_scale=(896, 896),
32
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
33
+ flip=False,
34
+ transforms=[
35
+ dict(type='Resize', keep_ratio=True),
36
+ dict(type='RandomFlip'),
37
+ dict(type='Normalize', **img_norm_cfg),
38
+ dict(type='ImageToTensor', keys=['img']),
39
+ dict(type='Collect', keys=['img']),
40
+ ])
41
+ ]
42
+ data = dict(
43
+ samples_per_gpu=4,
44
+ workers_per_gpu=4,
45
+ train=dict(
46
+ type=dataset_type,
47
+ data_root=data_root,
48
+ img_dir='img_dir/train',
49
+ ann_dir='ann_dir/train',
50
+ pipeline=train_pipeline),
51
+ val=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='img_dir/val',
55
+ ann_dir='ann_dir/val',
56
+ pipeline=test_pipeline),
57
+ test=dict(
58
+ type=dataset_type,
59
+ data_root=data_root,
60
+ img_dir='img_dir/val',
61
+ ann_dir='ann_dir/val',
62
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/loveda.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'LoveDADataset'
3
+ data_root = 'data/loveDA'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(1024, 1024),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='img_dir/train',
41
+ ann_dir='ann_dir/train',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='img_dir/val',
47
+ ann_dir='ann_dir/val',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='img_dir/val',
53
+ ann_dir='ann_dir/val',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/pascal_context.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'PascalContextDataset'
3
+ data_root = 'data/VOCdevkit/VOC2010/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+
7
+ img_scale = (520, 520)
8
+ crop_size = (480, 480)
9
+
10
+ train_pipeline = [
11
+ dict(type='LoadImageFromFile'),
12
+ dict(type='LoadAnnotations'),
13
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
14
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
15
+ dict(type='RandomFlip', prob=0.5),
16
+ dict(type='PhotoMetricDistortion'),
17
+ dict(type='Normalize', **img_norm_cfg),
18
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
19
+ dict(type='DefaultFormatBundle'),
20
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
21
+ ]
22
+ test_pipeline = [
23
+ dict(type='LoadImageFromFile'),
24
+ dict(
25
+ type='MultiScaleFlipAug',
26
+ img_scale=img_scale,
27
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
28
+ flip=False,
29
+ transforms=[
30
+ dict(type='Resize', keep_ratio=True),
31
+ dict(type='RandomFlip'),
32
+ dict(type='Normalize', **img_norm_cfg),
33
+ dict(type='ImageToTensor', keys=['img']),
34
+ dict(type='Collect', keys=['img']),
35
+ ])
36
+ ]
37
+ data = dict(
38
+ samples_per_gpu=4,
39
+ workers_per_gpu=4,
40
+ train=dict(
41
+ type=dataset_type,
42
+ data_root=data_root,
43
+ img_dir='JPEGImages',
44
+ ann_dir='SegmentationClassContext',
45
+ split='ImageSets/SegmentationContext/train.txt',
46
+ pipeline=train_pipeline),
47
+ val=dict(
48
+ type=dataset_type,
49
+ data_root=data_root,
50
+ img_dir='JPEGImages',
51
+ ann_dir='SegmentationClassContext',
52
+ split='ImageSets/SegmentationContext/val.txt',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='JPEGImages',
58
+ ann_dir='SegmentationClassContext',
59
+ split='ImageSets/SegmentationContext/val.txt',
60
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/pascal_context_59.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'PascalContextDataset59'
3
+ data_root = 'data/VOCdevkit/VOC2010/'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+
7
+ img_scale = (520, 520)
8
+ crop_size = (480, 480)
9
+
10
+ train_pipeline = [
11
+ dict(type='LoadImageFromFile'),
12
+ dict(type='LoadAnnotations', reduce_zero_label=True),
13
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
14
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
15
+ dict(type='RandomFlip', prob=0.5),
16
+ dict(type='PhotoMetricDistortion'),
17
+ dict(type='Normalize', **img_norm_cfg),
18
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
19
+ dict(type='DefaultFormatBundle'),
20
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
21
+ ]
22
+ test_pipeline = [
23
+ dict(type='LoadImageFromFile'),
24
+ dict(
25
+ type='MultiScaleFlipAug',
26
+ img_scale=img_scale,
27
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
28
+ flip=False,
29
+ transforms=[
30
+ dict(type='Resize', keep_ratio=True),
31
+ dict(type='RandomFlip'),
32
+ dict(type='Normalize', **img_norm_cfg),
33
+ dict(type='ImageToTensor', keys=['img']),
34
+ dict(type='Collect', keys=['img']),
35
+ ])
36
+ ]
37
+ data = dict(
38
+ samples_per_gpu=4,
39
+ workers_per_gpu=4,
40
+ train=dict(
41
+ type=dataset_type,
42
+ data_root=data_root,
43
+ img_dir='JPEGImages',
44
+ ann_dir='SegmentationClassContext',
45
+ split='ImageSets/SegmentationContext/train.txt',
46
+ pipeline=train_pipeline),
47
+ val=dict(
48
+ type=dataset_type,
49
+ data_root=data_root,
50
+ img_dir='JPEGImages',
51
+ ann_dir='SegmentationClassContext',
52
+ split='ImageSets/SegmentationContext/val.txt',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='JPEGImages',
58
+ ann_dir='SegmentationClassContext',
59
+ split='ImageSets/SegmentationContext/val.txt',
60
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/pascal_voc12.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'PascalVOCDataset'
3
+ data_root = 'data/VOCdevkit/VOC2012'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations'),
10
+ dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(2048, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='JPEGImages',
41
+ ann_dir='SegmentationClass',
42
+ split='ImageSets/Segmentation/train.txt',
43
+ pipeline=train_pipeline),
44
+ val=dict(
45
+ type=dataset_type,
46
+ data_root=data_root,
47
+ img_dir='JPEGImages',
48
+ ann_dir='SegmentationClass',
49
+ split='ImageSets/Segmentation/val.txt',
50
+ pipeline=test_pipeline),
51
+ test=dict(
52
+ type=dataset_type,
53
+ data_root=data_root,
54
+ img_dir='JPEGImages',
55
+ ann_dir='SegmentationClass',
56
+ split='ImageSets/Segmentation/val.txt',
57
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/pascal_voc12_aug.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ _base_ = './pascal_voc12.py'
2
+ # dataset settings
3
+ data = dict(
4
+ train=dict(
5
+ ann_dir=['SegmentationClass', 'SegmentationClassAug'],
6
+ split=[
7
+ 'ImageSets/Segmentation/train.txt',
8
+ 'ImageSets/Segmentation/aug.txt'
9
+ ]))
InternVL/segmentation/configs/_base_/datasets/potsdam.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'PotsdamDataset'
3
+ data_root = 'data/potsdam'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(512, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='img_dir/train',
41
+ ann_dir='ann_dir/train',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='img_dir/val',
47
+ ann_dir='ann_dir/val',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='img_dir/val',
53
+ ann_dir='ann_dir/val',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/stare.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'STAREDataset'
3
+ data_root = 'data/STARE'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ img_scale = (605, 700)
7
+ crop_size = (128, 128)
8
+ train_pipeline = [
9
+ dict(type='LoadImageFromFile'),
10
+ dict(type='LoadAnnotations'),
11
+ dict(type='Resize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
12
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
13
+ dict(type='RandomFlip', prob=0.5),
14
+ dict(type='PhotoMetricDistortion'),
15
+ dict(type='Normalize', **img_norm_cfg),
16
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
17
+ dict(type='DefaultFormatBundle'),
18
+ dict(type='Collect', keys=['img', 'gt_semantic_seg'])
19
+ ]
20
+ test_pipeline = [
21
+ dict(type='LoadImageFromFile'),
22
+ dict(
23
+ type='MultiScaleFlipAug',
24
+ img_scale=img_scale,
25
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
26
+ flip=False,
27
+ transforms=[
28
+ dict(type='Resize', keep_ratio=True),
29
+ dict(type='RandomFlip'),
30
+ dict(type='Normalize', **img_norm_cfg),
31
+ dict(type='ImageToTensor', keys=['img']),
32
+ dict(type='Collect', keys=['img'])
33
+ ])
34
+ ]
35
+
36
+ data = dict(
37
+ samples_per_gpu=4,
38
+ workers_per_gpu=4,
39
+ train=dict(
40
+ type='RepeatDataset',
41
+ times=40000,
42
+ dataset=dict(
43
+ type=dataset_type,
44
+ data_root=data_root,
45
+ img_dir='images/training',
46
+ ann_dir='annotations/training',
47
+ pipeline=train_pipeline)),
48
+ val=dict(
49
+ type=dataset_type,
50
+ data_root=data_root,
51
+ img_dir='images/validation',
52
+ ann_dir='annotations/validation',
53
+ pipeline=test_pipeline),
54
+ test=dict(
55
+ type=dataset_type,
56
+ data_root=data_root,
57
+ img_dir='images/validation',
58
+ ann_dir='annotations/validation',
59
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/datasets/vaihingen.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # dataset settings
2
+ dataset_type = 'ISPRSDataset'
3
+ data_root = 'data/vaihingen'
4
+ img_norm_cfg = dict(
5
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
6
+ crop_size = (512, 512)
7
+ train_pipeline = [
8
+ dict(type='LoadImageFromFile'),
9
+ dict(type='LoadAnnotations', reduce_zero_label=True),
10
+ dict(type='Resize', img_scale=(512, 512), ratio_range=(0.5, 2.0)),
11
+ dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
12
+ dict(type='RandomFlip', prob=0.5),
13
+ dict(type='PhotoMetricDistortion'),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
16
+ dict(type='DefaultFormatBundle'),
17
+ dict(type='Collect', keys=['img', 'gt_semantic_seg']),
18
+ ]
19
+ test_pipeline = [
20
+ dict(type='LoadImageFromFile'),
21
+ dict(
22
+ type='MultiScaleFlipAug',
23
+ img_scale=(512, 512),
24
+ # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
25
+ flip=False,
26
+ transforms=[
27
+ dict(type='Resize', keep_ratio=True),
28
+ dict(type='RandomFlip'),
29
+ dict(type='Normalize', **img_norm_cfg),
30
+ dict(type='ImageToTensor', keys=['img']),
31
+ dict(type='Collect', keys=['img']),
32
+ ])
33
+ ]
34
+ data = dict(
35
+ samples_per_gpu=4,
36
+ workers_per_gpu=4,
37
+ train=dict(
38
+ type=dataset_type,
39
+ data_root=data_root,
40
+ img_dir='img_dir/train',
41
+ ann_dir='ann_dir/train',
42
+ pipeline=train_pipeline),
43
+ val=dict(
44
+ type=dataset_type,
45
+ data_root=data_root,
46
+ img_dir='img_dir/val',
47
+ ann_dir='ann_dir/val',
48
+ pipeline=test_pipeline),
49
+ test=dict(
50
+ type=dataset_type,
51
+ data_root=data_root,
52
+ img_dir='img_dir/val',
53
+ ann_dir='ann_dir/val',
54
+ pipeline=test_pipeline))
InternVL/segmentation/configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # yapf:disable
2
+ log_config = dict(
3
+ interval=50,
4
+ hooks=[
5
+ dict(type='TextLoggerHook', by_epoch=False),
6
+ dict(type='TensorboardLoggerHook')
7
+ # dict(type='PaviLoggerHook') # for internal services
8
+ ])
9
+ # yapf:enable
10
+ dist_params = dict(backend='nccl')
11
+ log_level = 'INFO'
12
+ load_from = None
13
+ resume_from = None
14
+ workflow = [('train', 1)]
15
+ cudnn_benchmark = True
InternVL/segmentation/configs/_base_/models/ann_r50-d8.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='ANNHead',
19
+ in_channels=[1024, 2048],
20
+ in_index=[2, 3],
21
+ channels=512,
22
+ project_channels=256,
23
+ query_scales=(1, ),
24
+ key_pool_scales=(1, 3, 6, 8),
25
+ dropout_ratio=0.1,
26
+ num_classes=19,
27
+ norm_cfg=norm_cfg,
28
+ align_corners=False,
29
+ loss_decode=dict(
30
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
31
+ auxiliary_head=dict(
32
+ type='FCNHead',
33
+ in_channels=1024,
34
+ in_index=2,
35
+ channels=256,
36
+ num_convs=1,
37
+ concat_input=False,
38
+ dropout_ratio=0.1,
39
+ num_classes=19,
40
+ norm_cfg=norm_cfg,
41
+ align_corners=False,
42
+ loss_decode=dict(
43
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
44
+ # model training and testing settings
45
+ train_cfg=dict(),
46
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/bisenetv2.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained=None,
6
+ backbone=dict(
7
+ type='BiSeNetV2',
8
+ detail_channels=(64, 64, 128),
9
+ semantic_channels=(16, 32, 64, 128),
10
+ semantic_expansion_ratio=6,
11
+ bga_channels=128,
12
+ out_indices=(0, 1, 2, 3, 4),
13
+ init_cfg=None,
14
+ align_corners=False),
15
+ decode_head=dict(
16
+ type='FCNHead',
17
+ in_channels=128,
18
+ in_index=0,
19
+ channels=1024,
20
+ num_convs=1,
21
+ concat_input=False,
22
+ dropout_ratio=0.1,
23
+ num_classes=19,
24
+ norm_cfg=norm_cfg,
25
+ align_corners=False,
26
+ loss_decode=dict(
27
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
28
+ auxiliary_head=[
29
+ dict(
30
+ type='FCNHead',
31
+ in_channels=16,
32
+ channels=16,
33
+ num_convs=2,
34
+ num_classes=19,
35
+ in_index=1,
36
+ norm_cfg=norm_cfg,
37
+ concat_input=False,
38
+ align_corners=False,
39
+ loss_decode=dict(
40
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
41
+ dict(
42
+ type='FCNHead',
43
+ in_channels=32,
44
+ channels=64,
45
+ num_convs=2,
46
+ num_classes=19,
47
+ in_index=2,
48
+ norm_cfg=norm_cfg,
49
+ concat_input=False,
50
+ align_corners=False,
51
+ loss_decode=dict(
52
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
53
+ dict(
54
+ type='FCNHead',
55
+ in_channels=64,
56
+ channels=256,
57
+ num_convs=2,
58
+ num_classes=19,
59
+ in_index=3,
60
+ norm_cfg=norm_cfg,
61
+ concat_input=False,
62
+ align_corners=False,
63
+ loss_decode=dict(
64
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
65
+ dict(
66
+ type='FCNHead',
67
+ in_channels=128,
68
+ channels=1024,
69
+ num_convs=2,
70
+ num_classes=19,
71
+ in_index=4,
72
+ norm_cfg=norm_cfg,
73
+ concat_input=False,
74
+ align_corners=False,
75
+ loss_decode=dict(
76
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
77
+ ],
78
+ # model training and testing settings
79
+ train_cfg=dict(),
80
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/ccnet_r50-d8.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='CCHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ recurrence=2,
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/cgnet.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ backbone=dict(
6
+ type='CGNet',
7
+ norm_cfg=norm_cfg,
8
+ in_channels=3,
9
+ num_channels=(32, 64, 128),
10
+ num_blocks=(3, 21),
11
+ dilations=(2, 4),
12
+ reductions=(8, 16)),
13
+ decode_head=dict(
14
+ type='FCNHead',
15
+ in_channels=256,
16
+ in_index=2,
17
+ channels=256,
18
+ num_convs=0,
19
+ concat_input=False,
20
+ dropout_ratio=0,
21
+ num_classes=19,
22
+ norm_cfg=norm_cfg,
23
+ loss_decode=dict(
24
+ type='CrossEntropyLoss',
25
+ use_sigmoid=False,
26
+ loss_weight=1.0,
27
+ class_weight=[
28
+ 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
29
+ 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
30
+ 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
31
+ 10.396974, 10.055647
32
+ ])),
33
+ # model training and testing settings
34
+ train_cfg=dict(sampler=None),
35
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/deeplabv3_r50-d8.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='ASPPHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ dilations=(1, 12, 24, 36),
23
+ dropout_ratio=0.1,
24
+ num_classes=19,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
29
+ auxiliary_head=dict(
30
+ type='FCNHead',
31
+ in_channels=1024,
32
+ in_index=2,
33
+ channels=256,
34
+ num_convs=1,
35
+ concat_input=False,
36
+ dropout_ratio=0.1,
37
+ num_classes=19,
38
+ norm_cfg=norm_cfg,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
42
+ # model training and testing settings
43
+ train_cfg=dict(),
44
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/deeplabv3_unet_s5-d16.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained=None,
6
+ backbone=dict(
7
+ type='UNet',
8
+ in_channels=3,
9
+ base_channels=64,
10
+ num_stages=5,
11
+ strides=(1, 1, 1, 1, 1),
12
+ enc_num_convs=(2, 2, 2, 2, 2),
13
+ dec_num_convs=(2, 2, 2, 2),
14
+ downsamples=(True, True, True, True),
15
+ enc_dilations=(1, 1, 1, 1, 1),
16
+ dec_dilations=(1, 1, 1, 1),
17
+ with_cp=False,
18
+ conv_cfg=None,
19
+ norm_cfg=norm_cfg,
20
+ act_cfg=dict(type='ReLU'),
21
+ upsample_cfg=dict(type='InterpConv'),
22
+ norm_eval=False),
23
+ decode_head=dict(
24
+ type='ASPPHead',
25
+ in_channels=64,
26
+ in_index=4,
27
+ channels=16,
28
+ dilations=(1, 12, 24, 36),
29
+ dropout_ratio=0.1,
30
+ num_classes=2,
31
+ norm_cfg=norm_cfg,
32
+ align_corners=False,
33
+ loss_decode=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
35
+ auxiliary_head=dict(
36
+ type='FCNHead',
37
+ in_channels=128,
38
+ in_index=3,
39
+ channels=64,
40
+ num_convs=1,
41
+ concat_input=False,
42
+ dropout_ratio=0.1,
43
+ num_classes=2,
44
+ norm_cfg=norm_cfg,
45
+ align_corners=False,
46
+ loss_decode=dict(
47
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
48
+ # model training and testing settings
49
+ train_cfg=dict(),
50
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
InternVL/segmentation/configs/_base_/models/dnl_r50-d8.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='DNLHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ dropout_ratio=0.1,
23
+ reduction=2,
24
+ use_scale=True,
25
+ mode='embedded_gaussian',
26
+ num_classes=19,
27
+ norm_cfg=norm_cfg,
28
+ align_corners=False,
29
+ loss_decode=dict(
30
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
31
+ auxiliary_head=dict(
32
+ type='FCNHead',
33
+ in_channels=1024,
34
+ in_index=2,
35
+ channels=256,
36
+ num_convs=1,
37
+ concat_input=False,
38
+ dropout_ratio=0.1,
39
+ num_classes=19,
40
+ norm_cfg=norm_cfg,
41
+ align_corners=False,
42
+ loss_decode=dict(
43
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
44
+ # model training and testing settings
45
+ train_cfg=dict(),
46
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/dpt_vit-b16.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
2
+ model = dict(
3
+ type='EncoderDecoder',
4
+ pretrained='pretrain/vit-b16_p16_224-80ecf9dd.pth', # noqa
5
+ backbone=dict(
6
+ type='VisionTransformer',
7
+ img_size=224,
8
+ embed_dims=768,
9
+ num_layers=12,
10
+ num_heads=12,
11
+ out_indices=(2, 5, 8, 11),
12
+ final_norm=False,
13
+ with_cls_token=True,
14
+ output_cls_token=True),
15
+ decode_head=dict(
16
+ type='DPTHead',
17
+ in_channels=(768, 768, 768, 768),
18
+ channels=256,
19
+ embed_dims=768,
20
+ post_process_channels=[96, 192, 384, 768],
21
+ num_classes=150,
22
+ readout_type='project',
23
+ input_transform='multiple_select',
24
+ in_index=(0, 1, 2, 3),
25
+ norm_cfg=norm_cfg,
26
+ loss_decode=dict(
27
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
28
+ auxiliary_head=None,
29
+ # model training and testing settings
30
+ train_cfg=dict(),
31
+ test_cfg=dict(mode='whole')) # yapf: disable
InternVL/segmentation/configs/_base_/models/emanet_r50-d8.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='EMAHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=256,
22
+ ema_channels=512,
23
+ num_bases=64,
24
+ num_stages=3,
25
+ momentum=0.1,
26
+ dropout_ratio=0.1,
27
+ num_classes=19,
28
+ norm_cfg=norm_cfg,
29
+ align_corners=False,
30
+ loss_decode=dict(
31
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
32
+ auxiliary_head=dict(
33
+ type='FCNHead',
34
+ in_channels=1024,
35
+ in_index=2,
36
+ channels=256,
37
+ num_convs=1,
38
+ concat_input=False,
39
+ dropout_ratio=0.1,
40
+ num_classes=19,
41
+ norm_cfg=norm_cfg,
42
+ align_corners=False,
43
+ loss_decode=dict(
44
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
45
+ # model training and testing settings
46
+ train_cfg=dict(),
47
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/fast_scnn.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True, momentum=0.01)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ backbone=dict(
6
+ type='FastSCNN',
7
+ downsample_dw_channels=(32, 48),
8
+ global_in_channels=64,
9
+ global_block_channels=(64, 96, 128),
10
+ global_block_strides=(2, 2, 1),
11
+ global_out_channels=128,
12
+ higher_in_channels=64,
13
+ lower_in_channels=128,
14
+ fusion_out_channels=128,
15
+ out_indices=(0, 1, 2),
16
+ norm_cfg=norm_cfg,
17
+ align_corners=False),
18
+ decode_head=dict(
19
+ type='DepthwiseSeparableFCNHead',
20
+ in_channels=128,
21
+ channels=128,
22
+ concat_input=False,
23
+ num_classes=19,
24
+ in_index=-1,
25
+ norm_cfg=norm_cfg,
26
+ align_corners=False,
27
+ loss_decode=dict(
28
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1)),
29
+ auxiliary_head=[
30
+ dict(
31
+ type='FCNHead',
32
+ in_channels=128,
33
+ channels=32,
34
+ num_convs=1,
35
+ num_classes=19,
36
+ in_index=-2,
37
+ norm_cfg=norm_cfg,
38
+ concat_input=False,
39
+ align_corners=False,
40
+ loss_decode=dict(
41
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
42
+ dict(
43
+ type='FCNHead',
44
+ in_channels=64,
45
+ channels=32,
46
+ num_convs=1,
47
+ num_classes=19,
48
+ in_index=-3,
49
+ norm_cfg=norm_cfg,
50
+ concat_input=False,
51
+ align_corners=False,
52
+ loss_decode=dict(
53
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
54
+ ],
55
+ # model training and testing settings
56
+ train_cfg=dict(),
57
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/fcn_r50-d8.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='FCNHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ num_convs=2,
23
+ concat_input=True,
24
+ dropout_ratio=0.1,
25
+ num_classes=19,
26
+ norm_cfg=norm_cfg,
27
+ align_corners=False,
28
+ loss_decode=dict(
29
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
30
+ auxiliary_head=dict(
31
+ type='FCNHead',
32
+ in_channels=1024,
33
+ in_index=2,
34
+ channels=256,
35
+ num_convs=1,
36
+ concat_input=False,
37
+ dropout_ratio=0.1,
38
+ num_classes=19,
39
+ norm_cfg=norm_cfg,
40
+ align_corners=False,
41
+ loss_decode=dict(
42
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
43
+ # model training and testing settings
44
+ train_cfg=dict(),
45
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/fcn_unet_s5-d16.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained=None,
6
+ backbone=dict(
7
+ type='UNet',
8
+ in_channels=3,
9
+ base_channels=64,
10
+ num_stages=5,
11
+ strides=(1, 1, 1, 1, 1),
12
+ enc_num_convs=(2, 2, 2, 2, 2),
13
+ dec_num_convs=(2, 2, 2, 2),
14
+ downsamples=(True, True, True, True),
15
+ enc_dilations=(1, 1, 1, 1, 1),
16
+ dec_dilations=(1, 1, 1, 1),
17
+ with_cp=False,
18
+ conv_cfg=None,
19
+ norm_cfg=norm_cfg,
20
+ act_cfg=dict(type='ReLU'),
21
+ upsample_cfg=dict(type='InterpConv'),
22
+ norm_eval=False),
23
+ decode_head=dict(
24
+ type='FCNHead',
25
+ in_channels=64,
26
+ in_index=4,
27
+ channels=64,
28
+ num_convs=1,
29
+ concat_input=False,
30
+ dropout_ratio=0.1,
31
+ num_classes=2,
32
+ norm_cfg=norm_cfg,
33
+ align_corners=False,
34
+ loss_decode=dict(
35
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
36
+ auxiliary_head=dict(
37
+ type='FCNHead',
38
+ in_channels=128,
39
+ in_index=3,
40
+ channels=64,
41
+ num_convs=1,
42
+ concat_input=False,
43
+ dropout_ratio=0.1,
44
+ num_classes=2,
45
+ norm_cfg=norm_cfg,
46
+ align_corners=False,
47
+ loss_decode=dict(
48
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
49
+ # model training and testing settings
50
+ train_cfg=dict(),
51
+ test_cfg=dict(mode='slide', crop_size=256, stride=170))
InternVL/segmentation/configs/_base_/models/gcnet_r50-d8.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ pretrained='open-mmlab://resnet50_v1c',
6
+ backbone=dict(
7
+ type='ResNetV1c',
8
+ depth=50,
9
+ num_stages=4,
10
+ out_indices=(0, 1, 2, 3),
11
+ dilations=(1, 1, 2, 4),
12
+ strides=(1, 2, 1, 1),
13
+ norm_cfg=norm_cfg,
14
+ norm_eval=False,
15
+ style='pytorch',
16
+ contract_dilation=True),
17
+ decode_head=dict(
18
+ type='GCHead',
19
+ in_channels=2048,
20
+ in_index=3,
21
+ channels=512,
22
+ ratio=1 / 4.,
23
+ pooling_type='att',
24
+ fusion_types=('channel_add', ),
25
+ dropout_ratio=0.1,
26
+ num_classes=19,
27
+ norm_cfg=norm_cfg,
28
+ align_corners=False,
29
+ loss_decode=dict(
30
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
31
+ auxiliary_head=dict(
32
+ type='FCNHead',
33
+ in_channels=1024,
34
+ in_index=2,
35
+ channels=256,
36
+ num_convs=1,
37
+ concat_input=False,
38
+ dropout_ratio=0.1,
39
+ num_classes=19,
40
+ norm_cfg=norm_cfg,
41
+ align_corners=False,
42
+ loss_decode=dict(
43
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
44
+ # model training and testing settings
45
+ train_cfg=dict(),
46
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/icnet_r50-d8.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model settings
2
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
3
+ model = dict(
4
+ type='EncoderDecoder',
5
+ backbone=dict(
6
+ type='ICNet',
7
+ backbone_cfg=dict(
8
+ type='ResNetV1c',
9
+ in_channels=3,
10
+ depth=50,
11
+ num_stages=4,
12
+ out_indices=(0, 1, 2, 3),
13
+ dilations=(1, 1, 2, 4),
14
+ strides=(1, 2, 1, 1),
15
+ norm_cfg=norm_cfg,
16
+ norm_eval=False,
17
+ style='pytorch',
18
+ contract_dilation=True),
19
+ in_channels=3,
20
+ layer_channels=(512, 2048),
21
+ light_branch_middle_channels=32,
22
+ psp_out_channels=512,
23
+ out_channels=(64, 256, 256),
24
+ norm_cfg=norm_cfg,
25
+ align_corners=False,
26
+ ),
27
+ neck=dict(
28
+ type='ICNeck',
29
+ in_channels=(64, 256, 256),
30
+ out_channels=128,
31
+ norm_cfg=norm_cfg,
32
+ align_corners=False),
33
+ decode_head=dict(
34
+ type='FCNHead',
35
+ in_channels=128,
36
+ channels=128,
37
+ num_convs=1,
38
+ in_index=2,
39
+ dropout_ratio=0,
40
+ num_classes=19,
41
+ norm_cfg=norm_cfg,
42
+ concat_input=False,
43
+ align_corners=False,
44
+ loss_decode=dict(
45
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
46
+ auxiliary_head=[
47
+ dict(
48
+ type='FCNHead',
49
+ in_channels=128,
50
+ channels=128,
51
+ num_convs=1,
52
+ num_classes=19,
53
+ in_index=0,
54
+ norm_cfg=norm_cfg,
55
+ concat_input=False,
56
+ align_corners=False,
57
+ loss_decode=dict(
58
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
59
+ dict(
60
+ type='FCNHead',
61
+ in_channels=128,
62
+ channels=128,
63
+ num_convs=1,
64
+ num_classes=19,
65
+ in_index=1,
66
+ norm_cfg=norm_cfg,
67
+ concat_input=False,
68
+ align_corners=False,
69
+ loss_decode=dict(
70
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
71
+ ],
72
+ # model training and testing settings
73
+ train_cfg=dict(),
74
+ test_cfg=dict(mode='whole'))
InternVL/segmentation/configs/_base_/models/mask2former_beit.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # model_cfg
2
+ num_things_classes = 100
3
+ num_stuff_classes = 50
4
+ num_classes = num_things_classes + num_stuff_classes
5
+ norm_cfg = dict(type='SyncBN', requires_grad=True)
6
+ model = dict(
7
+ type='EncoderDecoderMask2Former',
8
+ pretrained=None,
9
+ backbone=dict(
10
+ type='XCiT',
11
+ patch_size=16,
12
+ embed_dim=384,
13
+ depth=12,
14
+ num_heads=8,
15
+ mlp_ratio=4,
16
+ qkv_bias=True,
17
+ use_abs_pos_emb=True,
18
+ use_rel_pos_bias=False,
19
+ ),
20
+ decode_head=dict(
21
+ type='Mask2FormerHead',
22
+ in_channels=[256, 512, 1024, 2048], # pass to pixel_decoder inside
23
+ # strides=[4, 8, 16, 32],
24
+ feat_channels=256,
25
+ out_channels=256,
26
+ in_index=[0, 1, 2, 3],
27
+ num_things_classes=num_things_classes,
28
+ num_stuff_classes=num_stuff_classes,
29
+ num_queries=100,
30
+ num_transformer_feat_level=3,
31
+ pixel_decoder=dict(
32
+ type='MSDeformAttnPixelDecoder',
33
+ num_outs=3,
34
+ norm_cfg=dict(type='GN', num_groups=32),
35
+ act_cfg=dict(type='ReLU'),
36
+ encoder=dict(
37
+ type='DetrTransformerEncoder',
38
+ num_layers=6,
39
+ transformerlayers=dict(
40
+ type='BaseTransformerLayer',
41
+ attn_cfgs=dict(
42
+ type='MultiScaleDeformableAttention',
43
+ embed_dims=256,
44
+ num_heads=8,
45
+ num_levels=3,
46
+ num_points=4,
47
+ im2col_step=64,
48
+ dropout=0.0,
49
+ batch_first=False,
50
+ norm_cfg=None,
51
+ init_cfg=None),
52
+ ffn_cfgs=dict(
53
+ type='FFN',
54
+ embed_dims=256,
55
+ feedforward_channels=1024,
56
+ num_fcs=2,
57
+ ffn_drop=0.0,
58
+ act_cfg=dict(type='ReLU', inplace=True)),
59
+ operation_order=('self_attn', 'norm', 'ffn', 'norm')),
60
+ init_cfg=None),
61
+ positional_encoding=dict(
62
+ type='SinePositionalEncoding', num_feats=128, normalize=True),
63
+ init_cfg=None),
64
+ enforce_decoder_input_project=False,
65
+ positional_encoding=dict(
66
+ type='SinePositionalEncoding', num_feats=128, normalize=True),
67
+ transformer_decoder=dict(
68
+ type='DetrTransformerDecoder',
69
+ return_intermediate=True,
70
+ num_layers=9,
71
+ transformerlayers=dict(
72
+ type='DetrTransformerDecoderLayer',
73
+ attn_cfgs=dict(
74
+ type='MultiheadAttention',
75
+ embed_dims=256,
76
+ num_heads=8,
77
+ attn_drop=0.0,
78
+ proj_drop=0.0,
79
+ dropout_layer=None,
80
+ batch_first=False),
81
+ ffn_cfgs=dict(
82
+ embed_dims=256,
83
+ feedforward_channels=2048,
84
+ num_fcs=2,
85
+ act_cfg=dict(type='ReLU', inplace=True),
86
+ ffn_drop=0.0,
87
+ dropout_layer=None,
88
+ add_identity=True),
89
+ feedforward_channels=2048,
90
+ operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
91
+ 'ffn', 'norm')),
92
+ init_cfg=None),
93
+ loss_cls=dict(
94
+ type='CrossEntropyLoss',
95
+ use_sigmoid=False,
96
+ loss_weight=2.0,
97
+ reduction='mean',
98
+ class_weight=[1.0] * num_classes + [0.1]),
99
+ loss_mask=dict(
100
+ type='CrossEntropyLoss',
101
+ use_sigmoid=True,
102
+ reduction='mean',
103
+ loss_weight=5.0),
104
+ loss_dice=dict(
105
+ type='DiceLoss',
106
+ use_sigmoid=True,
107
+ activate=True,
108
+ reduction='mean',
109
+ naive_dice=True,
110
+ eps=1.0,
111
+ loss_weight=5.0)),
112
+ train_cfg=dict(
113
+ num_points=12544,
114
+ oversample_ratio=3.0,
115
+ importance_sample_ratio=0.75,
116
+ assigner=dict(
117
+ type='MaskHungarianAssigner',
118
+ cls_cost=dict(type='ClassificationCost', weight=2.0),
119
+ mask_cost=dict(
120
+ type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True),
121
+ dice_cost=dict(
122
+ type='DiceCost', weight=5.0, pred_act=True, eps=1.0)),
123
+ sampler=dict(type='MaskPseudoSampler')),
124
+ test_cfg=dict(
125
+ panoptic_on=True,
126
+ # For now, the dataset does not support
127
+ # evaluating semantic segmentation metric.
128
+ semantic_on=False,
129
+ instance_on=True,
130
+ # max_per_image is for instance segmentation.
131
+ max_per_image=100,
132
+ iou_thr=0.8,
133
+ # In Mask2Former's panoptic postprocessing,
134
+ # it will filter mask area where score is less than 0.5 .
135
+ filter_low_score=True),
136
+ init_cfg=None)
137
+
138
+ # find_unused_parameters = True