akuan123 committed on
Commit
78e1b3b
·
1 Parent(s): 81418e7

Upload 16 files

Browse files
.clang-format ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AccessModifierOffset: -1
2
+ AlignAfterOpenBracket: AlwaysBreak
3
+ AlignConsecutiveAssignments: false
4
+ AlignConsecutiveDeclarations: false
5
+ AlignEscapedNewlinesLeft: true
6
+ AlignOperands: false
7
+ AlignTrailingComments: false
8
+ AllowAllParametersOfDeclarationOnNextLine: false
9
+ AllowShortBlocksOnASingleLine: false
10
+ AllowShortCaseLabelsOnASingleLine: false
11
+ AllowShortFunctionsOnASingleLine: Empty
12
+ AllowShortIfStatementsOnASingleLine: false
13
+ AllowShortLoopsOnASingleLine: false
14
+ AlwaysBreakAfterReturnType: None
15
+ AlwaysBreakBeforeMultilineStrings: true
16
+ AlwaysBreakTemplateDeclarations: true
17
+ BinPackArguments: false
18
+ BinPackParameters: false
19
+ BraceWrapping:
20
+ AfterClass: false
21
+ AfterControlStatement: false
22
+ AfterEnum: false
23
+ AfterFunction: false
24
+ AfterNamespace: false
25
+ AfterObjCDeclaration: false
26
+ AfterStruct: false
27
+ AfterUnion: false
28
+ BeforeCatch: false
29
+ BeforeElse: false
30
+ IndentBraces: false
31
+ BreakBeforeBinaryOperators: None
32
+ BreakBeforeBraces: Attach
33
+ BreakBeforeTernaryOperators: true
34
+ BreakConstructorInitializersBeforeComma: false
35
+ BreakAfterJavaFieldAnnotations: false
36
+ BreakStringLiterals: false
37
+ ColumnLimit: 80
38
+ CommentPragmas: '^ IWYU pragma:'
39
+ ConstructorInitializerAllOnOneLineOrOnePerLine: true
40
+ ConstructorInitializerIndentWidth: 4
41
+ ContinuationIndentWidth: 4
42
+ Cpp11BracedListStyle: true
43
+ DerivePointerAlignment: false
44
+ DisableFormat: false
45
+ ForEachMacros: [ FOR_EACH, FOR_EACH_R, FOR_EACH_RANGE, ]
46
+ IncludeCategories:
47
+ - Regex: '^<.*\.h(pp)?>'
48
+ Priority: 1
49
+ - Regex: '^<.*'
50
+ Priority: 2
51
+ - Regex: '.*'
52
+ Priority: 3
53
+ IndentCaseLabels: true
54
+ IndentWidth: 2
55
+ IndentWrappedFunctionNames: false
56
+ KeepEmptyLinesAtTheStartOfBlocks: false
57
+ MacroBlockBegin: ''
58
+ MacroBlockEnd: ''
59
+ MaxEmptyLinesToKeep: 1
60
+ NamespaceIndentation: None
61
+ ObjCBlockIndentWidth: 2
62
+ ObjCSpaceAfterProperty: false
63
+ ObjCSpaceBeforeProtocolList: false
64
+ PenaltyBreakBeforeFirstCallParameter: 1
65
+ PenaltyBreakComment: 300
66
+ PenaltyBreakFirstLessLess: 120
67
+ PenaltyBreakString: 1000
68
+ PenaltyExcessCharacter: 1000000
69
+ PenaltyReturnTypeOnItsOwnLine: 200
70
+ PointerAlignment: Left
71
+ ReflowComments: true
72
+ SortIncludes: true
73
+ SpaceAfterCStyleCast: false
74
+ SpaceBeforeAssignmentOperators: true
75
+ SpaceBeforeParens: ControlStatements
76
+ SpaceInEmptyParentheses: false
77
+ SpacesBeforeTrailingComments: 1
78
+ SpacesInAngles: false
79
+ SpacesInContainerLiterals: true
80
+ SpacesInCStyleCastParentheses: false
81
+ SpacesInParentheses: false
82
+ SpacesInSquareBrackets: false
83
+ Standard: Cpp11
84
+ TabWidth: 8
85
+ UseTab: Never
.flake8 ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is an example .flake8 config, used when developing *Black* itself.
2
+ # Keep in sync with setup.cfg which is used for source packages.
3
+
4
+ [flake8]
5
+ ignore = W503, E203, E221, C901, C408, E741, C407, B017
6
+ max-line-length = 100
7
+ max-complexity = 18
8
+ select = B,C,E,F,W,T4,B9
9
+ exclude = build
10
+ per-file-ignores =
11
+ **/__init__.py:F401,F403,E402
12
+ **/configs/**.py:F401,E402
13
+ configs/**.py:F401,E402
14
+ **/tests/config/**.py:F401,E402
15
+ tests/config/**.py:F401,E402
.gitignore ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # output dir
2
+ output
3
+ instant_test_output
4
+ inference_test_output
5
+ oai_clip_weights
6
+
7
+ pt_output
8
+ pretrained_ckpt
9
+ datasets/coco
10
+ datasets/lvis
11
+ datasets/cifar-10-batches-py
12
+
13
+ *.png
14
+ *.json
15
+ !vscode_launch.json
16
+ !/detectron2/data/classnames/*.json
17
+ *.diff
18
+ *.jpg
19
+ !/projects/DensePose/doc/images/*.jpg
20
+
21
+ # compilation and distribution
22
+ __pycache__
23
+ _ext
24
+ *.pyc
25
+ *.pyd
26
+ *.so
27
+ *.dll
28
+ *.egg-info/
29
+ build/
30
+ dist/
31
+ wheels/
32
+
33
+ # pytorch/python/numpy formats
34
+ *.pth
35
+ *.pkl
36
+ *.npy
37
+ *.ts
38
+ model_ts*.txt
39
+
40
+ # ipython/jupyter notebooks
41
+ *.ipynb
42
+ **/.ipynb_checkpoints/
43
+
44
+ # Editor temporaries
45
+ *.swn
46
+ *.swo
47
+ *.swp
48
+ *~
49
+
50
+ # editor settings
51
+ .idea
52
+ .vscode
53
+ _darcs
54
+
55
+ # project dirs
56
+ /detectron2/model_zoo/configs
57
+ /datasets/*
58
+ !/datasets/*.*
59
+ !/datasets/custom_images/
60
+ !/datasets/custom_images/*.jpg
61
+ !/docs
62
+ !/docs/*.*
63
+ /projects/*/datasets
64
+ /models
65
+ /snippet
LICENSE ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+
192
+ Licensed under the Apache License, Version 2.0 (the "License");
193
+ you may not use this file except in compliance with the License.
194
+ You may obtain a copy of the License at
195
+
196
+ http://www.apache.org/licenses/LICENSE-2.0
197
+
198
+ Unless required by applicable law or agreed to in writing, software
199
+ distributed under the License is distributed on an "AS IS" BASIS,
200
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
+ See the License for the specific language governing permissions and
202
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RegionCLIP: Region-based Language-Image Pretraining
2
+
3
+ This is the official PyTorch implementation of RegionCLIP (CVPR 2022).
4
+
5
+ [**Paper**](https://arxiv.org/abs/2112.09106) | [**Demo on Hugging Face**](https://huggingface.co/spaces/CVPR/regionclip-demo) | [**Slides**](https://drive.google.com/file/d/1EepNVJGo_d73Glr4vNjR4Av0dNkBCGcj/view?usp=sharing)
6
+
7
+ > **RegionCLIP: Region-based Language-Image Pretraining (CVPR 2022)** <br>
8
+ > [Yiwu Zhong](https://pages.cs.wisc.edu/~yiwuzhong/), [Jianwei Yang](https://jwyang.github.io/), [Pengchuan Zhang](https://pzzhang.github.io/pzzhang/), [Chunyuan Li](https://chunyuan.li/), [Noel Codella](https://noelcodella.github.io/publicwebsite/), [Liunian Li](https://liunian-harold-li.github.io/), [Luowei Zhou](https://luoweizhou.github.io/), [Xiyang Dai](https://sites.google.com/site/xiyangdai/), [Lu Yuan](https://scholar.google.com/citations?user=k9TsUVsAAAAJ&hl=en), [Yin Li](https://www.biostat.wisc.edu/~yli/), and [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/?from=http%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fum%2Fpeople%2Fjfgao%2F) <br>
9
+
10
+ <p align="center">
11
+ <img src="docs/regionclip.png" width=80% height=80%
12
+ class="center">
13
+ </p>
14
+
15
+ ## Overview
16
+
17
+ We propose RegionCLIP that significantly extends CLIP to learn region-level visual representations. RegionCLIP enables fine-grained alignment between image regions and textual concepts, and thus supports region-based reasoning tasks including zero-shot object detection and open-vocabulary object detection.
18
+
19
+ - **Pretraining**: We leverage a CLIP model to match image regions with template captions, and then pretrain our model to align these region-text pairs.
20
+ - **Zero-shot inference**: Once pretrained, the learned region representations support zero-shot inference for object detection.
21
+ - **Transfer learning**: The learned RegionCLIP model can be further fine-tuned with additional object detection annotations, allowing our model to be used for fully supervised or open-vocabulary object detection.
22
+ - **Results**: Our method demonstrates **state-of-the-art** results for zero-shot object detection and open-vocabulary object detection.
23
+
24
+ ## Updates
25
+ * :collision: [10/05/2022] RegionCLIP now supports not only resnet but also many vision transformers (e.g., vit, swin, davit, focalnet) for zero-shot object detection! Please check out the [zero-shot branch](https://github.com/microsoft/RegionCLIP/tree/zero-shot)!
26
+ * [09/23/2022] As requested by researchers, we release the [configs](configs/pretrain) and [scripts](pretrain.sh) of pre-training. A full tutorial and pre-training data will be released later. Stay tuned!
27
+ * [09/18/2022] Organizing ECCV Workshop [Computer Vision in the Wild (CVinW)](https://computer-vision-in-the-wild.github.io/eccv-2022/), where two challenges are hosted to evaluate the zero-shot, few-shot and full-shot performance of pre-trained vision models in downstream tasks:
28
+ - [Image Classification in the Wild (ICinW)](https://eval.ai/web/challenges/challenge-page/1832/overview) Challenge evaluates on 20 image classification tasks.
29
+ - [Object Detection in the Wild (ODinW)](https://eval.ai/web/challenges/challenge-page/1839/overview) Challenge evaluates on 35 object detection tasks.
30
+ * [07/11/2022] We included the scripts for concept feature extraction. They can be used for your own customized concept pool!
31
+ * [07/07/2022] We included the scripts for region feature extraction. The extracted visual features can be used for various downstream tasks!
32
+ * [06/24/2022] We released [**a Web demo using Gradio on Hugging Face**](https://huggingface.co/spaces/CVPR/regionclip-demo). It uses our pretrained RegionCLIP for zero-shot inference. Check it out!
33
+ * [06/20/2022] We released models and inference code for our RegionCLIP!
34
+
35
+ ## Outline
36
+
37
+ 1. [Installation](#Installation)
38
+ 2. [Datasets](#Datasets)
39
+ 3. [Model Zoo](#Model-Zoo)
40
+ 4. [Zero-shot Inference](#Zero-shot-Inference)
41
+ 5. [Transfer Learning](#Transfer-Learning)
42
+ 6. [Extract Region Features](#Extract-Region-Features)
43
+ 7. [Extract Concept Features](#Extract-Concept-Features)
44
+ 8. [Citation and Acknowledgement](#Citation-and-Acknowledgement)
45
+ 9. [Contributing](#Contributing)
46
+
47
+ ## Installation
48
+
49
+ Check [`INSTALL.md`](docs/INSTALL.md) for installation instructions.
50
+
51
+ ## Datasets
52
+
53
+ Check [`datasets/README.md`](datasets/README.md) for dataset preparation.
54
+
55
+ ## Model Zoo
56
+
57
+ Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) for our pretrained models.
58
+
59
+
60
+ ## Zero-shot Inference
61
+
62
+ After pretraining, RegionCLIP can directly support the challenging zero-shot object detection task **without finetuning on detection annotation**. Given an input image, our pretrained RegionCLIP can match image region features to object concept embeddings, and thus classify image regions into many object categories. The image regions are produced by a region localizer (e.g., RPN), and the object class names come from a dictionary **specified by users**.
63
+
64
+
65
+ ### Visualization on custom images
66
+
67
+ We provide an example below for zero-shot object detection with pretrained RegionCLIP on custom images and for visualizing the results.
68
+
69
+ <details>
70
+
71
+ <summary>
72
+ Before detecting objects, please prepare pretrained models, label files, and the custom images. See details below.
73
+ </summary>
74
+
75
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
76
+ - download the pretrained model checkpoint `regionclip_pretrained-cc_rn50x4.pth` (RegionCLIP with ResNet50x4) to the folder `./pretrained_ckpt/regionclip`.
77
+ - download the class embeddings `lvis_1203_cls_emb_rn50x4.pth` to the folder `./pretrained_ckpt/concept_emb`.
78
+ - Check [`datasets/README.md`](datasets/README.md) to download the LVIS label file `lvis_v1_val.json` and save it as `./datasets/lvis/lvis_v1_val.json`. The file is used to specify object class names.
79
+ - Put all custom images in the folder `./datasets/custom_images/`.
80
+
81
+ </details>
82
+
83
+ <details>
84
+
85
+ <summary>
86
+ After preparation, run the following script to detect objects.
87
+ </summary>
88
+
89
+ ```
90
+ python3 ./tools/train_net.py \
91
+ --eval-only \
92
+ --num-gpus 1 \
93
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml \
94
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
95
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
96
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
97
+ MODEL.CLIP.TEXT_EMB_DIM 640 \
98
+ MODEL.RESNETS.DEPTH 200 \
99
+ MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18
100
+ ```
101
+
102
+ </details>
103
+
104
+ <details>
105
+
106
+ <summary>
107
+ The detection results will be stored as the file "./output/inference/lvis_instances_results.json". To visualize it, run the script below.
108
+ </summary>
109
+
110
+ ```
111
+ python ./tools/visualize_json_results.py \
112
+ --input ./output/inference/lvis_instances_results.json \
113
+ --output ./output/regions \
114
+ --dataset lvis_v1_val_custom_img \
115
+ --conf-threshold 0.05 \
116
+ --show-unique-boxes \
117
+ --max-boxes 25 \
118
+ --small-region-px 8100
119
+ ```
120
+ </details>
121
+
122
+ The visualized images will be placed at `./output/regions/`. The visualized images would look like:
123
+
124
+ <p align="center">
125
+ <img src="docs/sample_img1_vis.jpg" width=80% height=80%
126
+ class="center">
127
+ </p>
128
+
129
+ In this example, the detection results come from our pretrained RegionCLIP with ResNet50x4 architecture. The regions are proposed by an RPN trained on 866 object categories from the LVIS dataset. For now, we use 1203 object class names from the LVIS dataset for this visualization example. We also include an example in `visualize_zeroshot_inference.sh` with our pretrained RegionCLIP (ResNet50 architecture).
130
+
131
+
132
+ ### Evaluation for zero-shot inference
133
+
134
+ We provide an example below for evaluating our pretrained RegionCLIP (ResNet50) using ground-truth boxes on COCO dataset. This will reproduce our results in Table 4 of the paper.
135
+
136
+ <details>
137
+
138
+ <summary>
139
+ Before evaluation, please prepare pretrained models and set up the dataset.
140
+ </summary>
141
+
142
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
143
+ - download the pretrained RegionCLIP checkpoint `regionclip_pretrained-cc_rn50.pth` to the folder `./pretrained_ckpt/regionclip`.
144
+ - download the class embeddings `coco_65_cls_emb.pth` to the folder `./pretrained_ckpt/concept_emb`.
145
+ - Check [`datasets/README.md`](datasets/README.md) to set up COCO dataset.
146
+
147
+ </details>
148
+
149
+ <details>
150
+
151
+ <summary>
152
+ After preparation, run the following script to evaluate the pretrained model in zero-shot inference setting.
153
+ </summary>
154
+
155
+ ```
156
+ python3 ./tools/train_net.py \
157
+ --eval-only \
158
+ --num-gpus 1 \
159
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml \
160
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
161
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
162
+ MODEL.CLIP.CROP_REGION_TYPE GT \
163
+ MODEL.CLIP.MULTIPLY_RPN_SCORE False
164
+ ```
165
+
166
+ </details>
167
+
168
+ For more examples, please refer to `test_zeroshot_inference.sh`. This script covers a wide combination of pretrained models (ResNet50, ResNet50x4), datasets (COCO, LVIS) and region proposal types (ground-truth regions, RPN proposals). Also, please refer to [MODEL_ZOO.md](docs/MODEL_ZOO.md) for available trained models and [`datasets/README.md`](datasets/README.md) for setting up COCO and LVIS datasets.
169
+
170
+ ## Transfer Learning
171
+
172
+ Our pretrained RegionCLIP can be further **fine-tuned** when human annotations of objects are available. In this transfer learning setting, we demonstrate results on **open-vocabulary object detection**, where the object detector is trained on base categories and evaluated on both base and **novel** categories.
173
+
174
+ We show an example for running a trained detector on custom images. Further, we provide training and evaluation scripts for the **open-vocabulary object detection** benchmarks, including the COCO and LVIS datasets (Tables 1 & 2 in the paper).
175
+
176
+
177
+ ### Visualization on custom images
178
+
179
+ We provide an example below for running a trained open-vocabulary object detector on custom images and for visualizing the results. In this example, the detector is initialized using RegionCLIP (RN50x4), trained on 866 LVIS base categories, and is tasked to detect all 1203 categories on LVIS.
180
+
181
+ <details>
182
+
183
+ <summary>
184
+ Before detecting objects, please prepare the trained detectors, label files, and the custom images.
185
+ </summary>
186
+
187
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
188
+ - download the trained detector checkpoint `regionclip_finetuned-lvis_rn50x4.pth` to the folder `./pretrained_ckpt/regionclip`.
189
+ - download the trained RPN checkpoint `rpn_lvis_866_lsj.pth` to the folder `./pretrained_ckpt/rpn`.
190
+ - download the class embeddings `lvis_1203_cls_emb_rn50x4.pth` to the folder `./pretrained_ckpt/concept_emb`.
191
+ - Check [`datasets/README.md`](datasets/README.md) to download the label file `lvis_v1_val.json` and save it as `./datasets/lvis/lvis_v1_val.json`.
192
+ - Put all custom images in the folder `./datasets/custom_images/`.
193
+
194
+ </details>
195
+
196
+ <details>
197
+
198
+ <summary>
199
+ After preparation, run the following script to detect objects and visualize the results.
200
+ </summary>
201
+
202
+ ```
203
+ # for simplicity, we integrate the script in visualize_transfer_learning.sh
204
+ bash visualize_transfer_learning.sh
205
+ ```
206
+
207
+ </details>
208
+
209
+
210
+ The visualized images will be placed at `./output/regions/`.
211
+
212
+
213
+ ### Evaluate the trained detectors
214
+
215
+ We provide an example below for evaluating our open-vocabulary object detector, initialized by RegionCLIP (ResNet50) and trained on COCO dataset.
216
+
217
+ <details>
218
+
219
+ <summary>
220
+ Before evaluation, please prepare the trained detector and set up the dataset.
221
+ </summary>
222
+
223
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
224
+ - download the trained detector checkpoint `regionclip_finetuned-coco_rn50.pth` to the folder `./pretrained_ckpt/regionclip`,
225
+ - download the trained RPN checkpoint `rpn_coco_48.pth` to the folder `./pretrained_ckpt/rpn`,
226
+ - download the class embeddings `coco_48_base_cls_emb.pth` and `coco_65_cls_emb.pth` to the folder `./pretrained_ckpt/concept_emb`.
227
+ - Check [`datasets/README.md`](datasets/README.md) to set up COCO dataset.
228
+
229
+ </details>
230
+
231
+ <details>
232
+
233
+ <summary>
234
+ After preparation, run the following script to evaluate the trained open-vocabulary detector.
235
+ </summary>
236
+
237
+ ```
238
+ python3 ./tools/train_net.py \
239
+ --eval-only \
240
+ --num-gpus 1 \
241
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
242
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-coco_rn50.pth \
243
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
244
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
245
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
246
+ MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
247
+ MODEL.ROI_HEADS.SOFT_NMS_ENABLED True
248
+ ```
249
+
250
+ </details>
251
+
252
+
253
+ For more examples, please refer to `test_transfer_learning.sh`. This script includes benchmark evaluation for various combinations of trained detectors (ResNet50, ResNet50x4) and datasets (COCO, LVIS). Also, please refer to [MODEL_ZOO.md](docs/MODEL_ZOO.md) for available trained models and [`datasets/README.md`](datasets/README.md) for setting up COCO and LVIS datasets.
254
+
255
+
256
+ ### Train detectors on your own
257
+
258
+ We provide an example below for training an open-vocabulary object detector on COCO dataset, with pretrained RegionCLIP (ResNet50) as the initialization.
259
+
260
+ <details>
261
+
262
+ <summary>
263
+ Before training, please prepare our pretrained RegionCLIP model and set up the dataset.
264
+ </summary>
265
+
266
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
267
+ - download the pretrained RegionCLIP checkpoint `regionclip_pretrained-cc_rn50.pth` to the folder `./pretrained_ckpt/regionclip`,
268
+ - download the trained RPN checkpoint `rpn_coco_48.pth` to the folder `./pretrained_ckpt/rpn`,
269
+ - download the class embeddings `coco_48_base_cls_emb.pth` and `coco_65_cls_emb.pth` to the folder `./pretrained_ckpt/concept_emb`.
270
+ - Check [`datasets/README.md`](datasets/README.md) to set up COCO dataset.
271
+
272
+ </details>
273
+
274
+ <details>
275
+
276
+ <summary>
277
+ After preparation, run the following script to train an open-vocabulary detector.
278
+ </summary>
279
+
280
+ ```
281
+ python3 ./tools/train_net.py \
282
+ --num-gpus 1 \
283
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
284
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
285
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
286
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
287
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
288
+ MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
289
+ ```
290
+
291
+ </details>
292
+
293
+ For more examples, please refer to `train_transfer_learning.sh`. This script provides training scripts for various combinations of detector backbones (ResNet50, ResNet50x4) and datasets (COCO, LVIS). Also, please refer to [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) for available trained models and [`datasets/README.md`](datasets/README.md) for setting up COCO and LVIS datasets.
294
+
295
+
296
+ ## Extract Region Features
297
+
298
+ We provide scripts for extracting region features from our pre-trained RegionCLIP. Given a folder of images, our scripts extract region features (along with other detection results such as box coordinates) and save them as local files.
299
+
300
+ The following is an example using pretrained RegionCLIP with ResNet50. We extend the scripts from zero-shot inference (section above) with minor changes, such as the input and output folders.
301
+
302
+ <details>
303
+
304
+ <summary>
305
+ The following is a brief introduction for the settings.
306
+ </summary>
307
+
308
+ We enable feature extraction for two types of regions:
309
+
310
+ - RPN regions: This setting is class-agnostic. The regions are the top-scored RPN proposals.
311
+
312
+ - Detection regions: This setting requires an additional input: a concept embedding file (the concepts of interest). The regions are the final detection output boxes (after 2nd-stage NMS). As a reference, the [Bottom-Up features](https://openaccess.thecvf.com/content_cvpr_2018/papers/Anderson_Bottom-Up_and_Top-Down_CVPR_2018_paper.pdf) (widely used in vision-language tasks) also come from the final detection boxes.
313
+
314
+ </details>
315
+
316
+
317
+
318
+ <details>
319
+
320
+ <summary>
321
+ Before running scripts, please prepare pretrained models and your custom images.
322
+ </summary>
323
+
324
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
325
+ - download the pretrained RegionCLIP checkpoint `regionclip_pretrained-cc_rn50.pth` to the folder `./pretrained_ckpt/regionclip`.
326
+ - download the trained RPN checkpoint `rpn_lvis_866.pth` to the folder `./pretrained_ckpt/rpn`.
327
+ - (optional) if you want to extract features of the boxes detected for 1203 LVIS concepts, download the class embeddings `lvis_1203_cls_emb.pth` to the folder `./pretrained_ckpt/concept_emb`.
328
+ - Put all custom images in a folder. It can be specified in the script (check `INPUT_DIR` below).
329
+
330
+
331
+ </details>
332
+
333
+
334
+ <details>
335
+
336
+ <summary>
337
+ After preparation, run the following script to extract region features.
338
+ </summary>
339
+
340
+ The following script extracts features from **RPN regions**.
341
+ ```
342
+ # RN50, features of RPN regions
343
+ python3 ./tools/extract_region_features.py \
344
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
345
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
346
+ MODEL.CLIP.CROP_REGION_TYPE RPN \
347
+ MODEL.CLIP.MULTIPLY_RPN_SCORE True \
348
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
349
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
350
+ INPUT_DIR ./datasets/custom_images \
351
+ OUTPUT_DIR ./output/region_feats \
352
+ MODEL.CLIP.OFFLINE_RPN_POST_NMS_TOPK_TEST 100 \
353
+ ```
354
+
355
+ The following script extracts features from **detection regions** (after 2nd-stage NMS).
356
+
357
+ ```
358
+ # You can simply run "bash extract_region_features.sh"
359
+ python3 ./tools/extract_region_features.py \
360
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
361
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
362
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
363
+ MODEL.CLIP.CROP_REGION_TYPE RPN \
364
+ MODEL.CLIP.MULTIPLY_RPN_SCORE True \
365
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
366
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
367
+ INPUT_DIR ./datasets/custom_images \
368
+ OUTPUT_DIR ./output/region_feats \
369
+ TEST.DETECTIONS_PER_IMAGE 100 \
370
+ ```
371
+
372
+ The region features of each image will be saved into a `.pth` file in the folder `OUTPUT_DIR`. For simplicity, the current script only supports single-GPU inference. As a reference, it takes roughly 0.76 seconds on a single Titan-Xp GPU with RegionCLIP-ResNet50 and 1203 LVIS object concepts.
373
+
374
+ The following is a list of key arguments for feature extraction. You can specify them in the script as needed.
375
+
376
+ - `INPUT_DIR` and `OUTPUT_DIR`: specify a folder of input images and an output folder where region features will be saved, respectively.
377
+
378
+ - `MODEL.CLIP.BB_RPN_WEIGHTS`: specifies which trained RPN to use. You can replace it as needed. For more details, please check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md).
379
+
380
+ - `MODEL.CLIP.TEXT_EMB_PATH` (optional): specifies which object concept embedding file to use. The selection of concepts will affect the per-class NMS (2nd stage) and thus final output boxes.
381
+
382
+ - `TEST.DETECTIONS_PER_IMAGE`: defines the number of final output regions (e.g., default value is 100 in COCO configs and 300 in LVIS configs)
383
+
384
+ - `MODEL.CLIP.OFFLINE_RPN_POST_NMS_TOPK_TEST`: defines the number of region proposals from RPN (e.g., default is 1000). Lowering this value can significantly reduce inference time and memory cost, but might affect the final detection quality.
385
+
386
+ - `MODEL.CLIP.OFFLINE_RPN_NMS_THRESH` and `MODEL.ROI_HEADS.NMS_THRESH_TEST`: control the NMS IoU thresholds in RPN (1st stage, default is 0.9) and prediction head (2nd stage, default is 0.5), respectively. If you extract features using RPN regions, you might want to change `MODEL.CLIP.OFFLINE_RPN_NMS_THRESH` as needed.
387
+
388
+ </details>
389
+
390
+ ## Extract Concept Features
391
+
392
+ Along with the region feature extraction, we also provide scripts for extracting concept features from our pre-trained RegionCLIP. Given a list of concepts, our scripts extract textual embeddings and save them as local files. The following is an example using pretrained RegionCLIP. We extend the scripts from region feature extraction (section above) with minor changes.
393
+
394
+
395
+ <details>
396
+
397
+ <summary>
398
+ Before running scripts, please prepare pretrained models and your custom concepts.
399
+ </summary>
400
+
401
+ - Check [`MODEL_ZOO.md`](docs/MODEL_ZOO.md) to
402
+ - download the pretrained RegionCLIP checkpoint `regionclip_pretrained-cc_rn50.pth` to the folder `./pretrained_ckpt/regionclip`.
403
+ - Put all concepts in the file `concepts.txt` with each line as a concept name. Place this file in a folder which can be specified in the script (check `INPUT_DIR` below).
404
+
405
+
406
+ </details>
407
+
408
+
409
+ <details>
410
+
411
+ <summary>
412
+ After preparation, run the following script to extract concept features.
413
+ </summary>
414
+
415
+ The following script extracts features from ResNet50.
416
+ ```
417
+ # RN50 concept embeddings
418
+ python3 ./tools/extract_concept_features.py \
419
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
420
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
421
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
422
+ INPUT_DIR ./datasets/custom_concepts \
423
+ OUTPUT_DIR ./output/concept_feats \
424
+ MODEL.CLIP.GET_CONCEPT_EMB True \
425
+ ```
426
+
427
+ And for ResNet50x4, use the following command:
428
+ ```
429
+ # RN50x4 concept embeddings
430
+ python3 ./tools/extract_concept_features.py \
431
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
432
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
433
+ MODEL.CLIP.TEXT_EMB_DIM 640 \
434
+ MODEL.RESNETS.DEPTH 200 \
435
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
436
+ INPUT_DIR ./datasets/custom_concepts \
437
+ OUTPUT_DIR ./output/concept_feats \
438
+ MODEL.CLIP.GET_CONCEPT_EMB True \
439
+ ```
440
+
441
+ The language embeddings of all concepts will be saved into a `.pth` file in the folder `OUTPUT_DIR`. These language embeddings have not been normalized yet, for the consistency with concept embeddings provided in [`MODEL_ZOO.md`](docs/MODEL_ZOO.md).
442
+
443
+ The following is a list of key arguments for feature extraction. You can specify them in the script as needed.
444
+
445
+ - `INPUT_DIR` and `OUTPUT_DIR`: specify the folder containing `concepts.txt` and an output folder where the concept embeddings will be saved, respectively.
446
+
447
+ </details>
448
+
449
+ ## Citation and Acknowledgement
450
+
451
+ ### Citation
452
+
453
+ If you find this repo useful, please consider citing our paper:
454
+
455
+ ```
456
+ @inproceedings{zhong2022regionclip,
457
+ title={Regionclip: Region-based language-image pretraining},
458
+ author={Zhong, Yiwu and Yang, Jianwei and Zhang, Pengchuan and Li, Chunyuan and Codella, Noel and Li, Liunian Harold and Zhou, Luowei and Dai, Xiyang and Yuan, Lu and Li, Yin and others},
459
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
460
+ pages={16793--16803},
461
+ year={2022}
462
+ }
463
+ ```
464
+
465
+ ### Acknowledgement
466
+
467
+ This repository was built on top of [Detectron2](https://github.com/facebookresearch/detectron2), [CLIP](https://github.com/openai/CLIP), [OVR-CNN](https://github.com/alirezazareian/ovr-cnn), and [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark). We thank the effort from our community.
468
+
469
+ ## Contributing
470
+
471
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a
472
+ Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
473
+ the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
474
+
475
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide
476
+ a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
477
+ provided by the bot. You will only need to do this once across all repos using our CLA.
478
+
479
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
480
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
481
+ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.7 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
extract_concept_features.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Extract concept features for a list of concepts
2
+
3
+ # RN50 concept embeddings
4
+ python3 ./tools/extract_concept_features.py \
5
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
6
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
7
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
8
+ INPUT_DIR ./datasets/custom_concepts \
9
+ OUTPUT_DIR ./output/concept_feats \
10
+ MODEL.CLIP.GET_CONCEPT_EMB True \
11
+
12
+ # RN50x4 concept embeddings
13
+ # python3 ./tools/extract_concept_features.py \
14
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
15
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
16
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
17
+ # MODEL.RESNETS.DEPTH 200 \
18
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
19
+ # INPUT_DIR ./datasets/custom_concepts \
20
+ # OUTPUT_DIR ./output/concept_feats \
21
+ # MODEL.CLIP.GET_CONCEPT_EMB True \
extract_region_features.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Extract region features for a folder of images
2
+
3
+ # RN50, LVIS 1203 concepts
4
+ python3 ./tools/extract_region_features.py \
5
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
6
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
7
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
8
+ MODEL.CLIP.CROP_REGION_TYPE RPN \
9
+ MODEL.CLIP.MULTIPLY_RPN_SCORE True \
10
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
11
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
12
+ INPUT_DIR ./datasets/custom_images \
13
+ OUTPUT_DIR ./output/region_feats \
14
+ TEST.DETECTIONS_PER_IMAGE 100 \
15
+
16
+ # # RN50, features of RPN proposals
17
+ # python3 ./tools/extract_region_features.py \
18
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
19
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
20
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
21
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
22
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
23
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
24
+ # INPUT_DIR ./datasets/custom_images \
25
+ # OUTPUT_DIR ./output/region_feats \
26
+ # MODEL.CLIP.OFFLINE_RPN_POST_NMS_TOPK_TEST 100 \
27
+
28
+ # # RN50x4, LVIS 1203 concepts
29
+ # python3 ./tools/extract_region_features.py \
30
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
31
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
32
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
33
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
34
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
35
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
36
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
37
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
38
+ # MODEL.RESNETS.DEPTH 200 \
39
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
40
+ # INPUT_DIR ./datasets/custom_images \
41
+ # OUTPUT_DIR ./output/region_feats \
42
+ # TEST.DETECTIONS_PER_IMAGE 100 \
pretrain.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NOTE: The pre-training section is still under construction.
2
+ # The pre-training code was already released (PretrainFastRCNN class).
3
+ # Now we release the config files and scripts (un-tested yet), as requested by researchers.
4
+ # We will release the pre-training data (image-text pairs) in near future.
5
+
6
+
7
+ # Distributed training across multiple nodes
8
+ # ResNet50 (default: batch 96, lr 0.002, 32 GPUs)
9
+ python3 -m launch --nnodes=2 --nproc_per_node=16 --master_port 12345 ./tools/train_net.py \
10
+ --num-gpus 16 \
11
+ --config-file ./configs/pretrain/RegionCLIP_RN50.yaml \
12
+ MODEL.WEIGHTS ./pretrained_ckpt/clip/teacher_RN50_student_RN50_OAI_CLIP.pth \
13
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
14
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
15
+ MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/coco_nouns_4764_emb.pth \
16
+ OUTPUT_DIR ./output/pretrain \
17
+
18
+
19
+ # ResNet50x4 (default: batch 96, lr 0.002, 32 GPUs)
20
+ python3 -m launch --nnodes=2 --nproc_per_node=16 --master_port 12345 ./tools/train_net.py \
21
+ --num-gpus 16 \
22
+ --config-file ./configs/pretrain/RegionCLIP_RN50x4.yaml \
23
+ MODEL.WEIGHTS ./pretrained_ckpt/clip/teacher_RN50x4_student_RN50x4_OAI_CLIP.pth \
24
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
25
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
26
+ MODEL.CLIP.CONCEPT_POOL_EMB ./pretrained_ckpt/concept_emb/coco_nouns_4764_emb_rn50x4.pth \
27
+ OUTPUT_DIR ./output/pretrain \
setup.cfg ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [isort]
2
+ line_length=100
3
+ multi_line_output=3
4
+ include_trailing_comma=True
5
+ known_standard_library=numpy,setuptools,mock
6
+ skip=./datasets,docs
7
+ skip_glob=*/__init__.py,**/configs/**,tests/config/**
8
+ known_myself=detectron2
9
+ known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx,panopticapi,black,isort,av,iopath,omegaconf,hydra,yaml,pydoc,submitit,cloudpickle
10
+ no_lines_before=STDLIB,THIRDPARTY
11
+ sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER
12
+ default_section=FIRSTPARTY
13
+
14
+ [mypy]
15
+ python_version=3.6
16
+ ignore_missing_imports = True
17
+ warn_unused_configs = True
18
+ disallow_untyped_defs = True
19
+ check_untyped_defs = True
20
+ warn_unused_ignores = True
21
+ warn_redundant_casts = True
22
+ show_column_numbers = True
23
+ follow_imports = silent
24
+ allow_redefinition = True
25
+ ; Require all functions to be annotated
26
+ disallow_incomplete_defs = True
setup.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+
4
+ import glob
5
+ import os
6
+ import shutil
7
+ from os import path
8
+ from setuptools import find_packages, setup
9
+ from typing import List
10
+ import torch
11
+ from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
12
+ from torch.utils.hipify import hipify_python
13
+
14
# Installed PyTorch release as [major, minor]; the build refuses anything older than 1.6.
torch_ver = list(map(int, torch.__version__.split(".")[:2]))
assert torch_ver >= [1, 6], "Requires PyTorch >= 1.6"
16
+
17
+
18
def get_version():
    """
    Return the package version string read from ``detectron2/__init__.py``.

    The base version is taken from the first line of that file starting with
    ``__version__``. The value of the ``D2_VERSION_SUFFIX`` environment variable
    (empty by default) is appended to it. When ``BUILD_NIGHTLY`` is ``"1"``, a
    ``.dev<yymmdd>`` tag is appended as well and ``detectron2/__init__.py`` is
    rewritten in place so that its ``__version__`` line carries the final version.

    Returns:
        str: the resulting version string.

    Raises:
        IndexError: if ``detectron2/__init__.py`` has no ``__version__`` line.
    """
    init_py_path = path.join(path.abspath(path.dirname(__file__)), "detectron2", "__init__.py")
    # Read through a context manager so the handle is closed deterministically;
    # the same file may be reopened for writing below.
    with open(init_py_path, "r") as f:
        init_py = f.readlines()
    version_line = [line.strip() for line in init_py if line.startswith("__version__")][0]
    version = version_line.split("=")[-1].strip().strip("'\"")

    # The following is used to build release packages.
    # Users should never use it.
    suffix = os.getenv("D2_VERSION_SUFFIX", "")
    version = version + suffix
    if os.getenv("BUILD_NIGHTLY", "0") == "1":
        from datetime import datetime

        date_str = datetime.today().strftime("%y%m%d")
        version = version + ".dev" + date_str

        # Drop the old __version__ line and append the nightly one at the end of the file.
        new_init_py = [line for line in init_py if not line.startswith("__version__")]
        new_init_py.append('__version__ = "{}"\n'.format(version))
        with open(init_py_path, "w") as f:
            f.write("".join(new_init_py))
    return version
39
+
40
+
41
def get_extensions():
    """
    Assemble the setuptools extension list for the compiled ``detectron2._C`` module.

    Sources are collected from ``detectron2/layers/csrc`` next to this file. A
    ``CppExtension`` is used by default. It is switched to ``CUDAExtension``, with the
    CUDA/HIP sources appended, when ``torch.cuda.is_available()`` and either
    ``CUDA_HOME`` is set or this is a ROCm build of PyTorch, or when the environment
    variable ``FORCE_CUDA`` equals ``"1"``.

    Returns:
        list: a one-element list holding the configured extension object.
    """
    this_dir = path.dirname(path.abspath(__file__))
    extensions_dir = path.join(this_dir, "detectron2", "layers", "csrc")

    main_source = path.join(extensions_dir, "vision.cpp")
    # NOTE: glob is called without recursive=True, so "**" behaves like "*" here and
    # only matches .cpp files exactly one directory below csrc.
    sources = glob.glob(path.join(extensions_dir, "**", "*.cpp"))

    from torch.utils.cpp_extension import ROCM_HOME

    # ROCm build: torch reports a HIP version and a ROCm installation was located.
    is_rocm_pytorch = (
        True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False
    )

    # hipify version as a list of ints; [0, 0, 0] when the module exposes no __version__.
    hipify_ver = (
        [int(x) for x in torch.utils.hipify.__version__.split(".")]
        if hasattr(torch.utils.hipify, "__version__")
        else [0, 0, 0]
    )

    if is_rocm_pytorch and hipify_ver < [1, 0, 0]:  # TODO not needed since pt1.8

        # Earlier versions of hipification and extension modules were not
        # transparent, i.e. would require an explicit call to hipify, and the
        # hipification would introduce "hip" subdirectories, possibly changing
        # the relationship between source and header files.
        # This path is maintained for backwards compatibility.

        hipify_python.hipify(
            project_directory=this_dir,
            output_directory=this_dir,
            includes="/detectron2/layers/csrc/*",
            show_detailed=True,
            is_pytorch_extension=True,
        )

        source_cuda = glob.glob(path.join(extensions_dir, "**", "hip", "*.hip")) + glob.glob(
            path.join(extensions_dir, "hip", "*.hip")
        )

        # Place copies of these headers inside the "hip" subdirectories.
        shutil.copy(
            "detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h",
            "detectron2/layers/csrc/box_iou_rotated/hip/box_iou_rotated_utils.h",
        )
        shutil.copy(
            "detectron2/layers/csrc/deformable/deform_conv.h",
            "detectron2/layers/csrc/deformable/hip/deform_conv.h",
        )

        sources = [main_source] + sources
        # On ROCm with torch >= 1.7, leave out any source ending in "hip/vision.cpp".
        sources = [
            s
            for s in sources
            if not is_rocm_pytorch or torch_ver < [1, 7] or not s.endswith("hip/vision.cpp")
        ]

    else:

        # common code between cuda and rocm platforms,
        # for hipify version [1,0,0] and later.

        source_cuda = glob.glob(path.join(extensions_dir, "**", "*.cu")) + glob.glob(
            path.join(extensions_dir, "*.cu")
        )

        sources = [main_source] + sources

    # CPU-only defaults; replaced below when a GPU build is selected.
    extension = CppExtension

    extra_compile_args = {"cxx": []}
    define_macros = []

    if (torch.cuda.is_available() and ((CUDA_HOME is not None) or is_rocm_pytorch)) or os.getenv(
        "FORCE_CUDA", "0"
    ) == "1":
        extension = CUDAExtension
        sources += source_cuda

        if not is_rocm_pytorch:
            define_macros += [("WITH_CUDA", None)]
            extra_compile_args["nvcc"] = [
                "-O3",
                "-DCUDA_HAS_FP16=1",
                "-D__CUDA_NO_HALF_OPERATORS__",
                "-D__CUDA_NO_HALF_CONVERSIONS__",
                "-D__CUDA_NO_HALF2_OPERATORS__",
            ]
        else:
            define_macros += [("WITH_HIP", None)]
            extra_compile_args["nvcc"] = []

        if torch_ver < [1, 7]:
            # supported by https://github.com/pytorch/pytorch/pull/43931
            # For older torch, pass the compiler named by $CC to nvcc explicitly via -ccbin.
            CC = os.environ.get("CC", None)
            if CC is not None:
                extra_compile_args["nvcc"].append("-ccbin={}".format(CC))

    include_dirs = [extensions_dir]

    ext_modules = [
        extension(
            "detectron2._C",
            sources,
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]

    return ext_modules
150
+
151
+
152
def get_model_zoo_configs() -> List[str]:
    """
    Expose the repository's ``configs`` directory inside ``detectron2/model_zoo`` and
    list the config files to ship as package data.

    ``detectron2/model_zoo/configs`` is (re)created as a symlink to the top-level
    ``configs`` folder, falling back to a full copy when symlinking fails.

    Returns:
        List[str]: every ``configs/**/*.yaml`` path followed by every ``configs/**/*.py``
        path, as found relative to the current working directory.
    """
    # Absolute locations, resolved from this file, are used for the symlink.
    repo_root = path.dirname(path.realpath(__file__))
    configs_src = path.join(repo_root, "configs")
    configs_dst = path.join(repo_root, "detectron2", "model_zoo", "configs")

    # Clear out whatever an earlier build left at the destination.
    if path.exists(configs_src):
        if path.islink(configs_dst):
            os.unlink(configs_dst)
        elif path.isdir(configs_dst):
            shutil.rmtree(configs_dst)

    # Linking (rather than copying) keeps the pip install clean.
    if not path.exists(configs_dst):
        try:
            os.symlink(configs_src, configs_dst)
        except OSError:
            # Symlinks may be unavailable (e.g. on Windows): copy the tree instead.
            shutil.copytree(configs_src, configs_dst)

    collected = []
    for pattern in ("configs/**/*.yaml", "configs/**/*.py"):
        collected.extend(glob.glob(pattern, recursive=True))
    return collected
183
+
184
+
185
# For projects that are relatively small and provide features that are very close
# to detectron2's core functionalities, we install them under detectron2.projects
# (every entry is currently commented out, so no extra project packages are installed).
PROJECTS = {
    # "detectron2.projects.point_rend": "projects/PointRend/point_rend",
    # "detectron2.projects.deeplab": "projects/DeepLab/deeplab",
    # "detectron2.projects.panoptic_deeplab": "projects/Panoptic-DeepLab/panoptic_deeplab",
}

# Package definition. Note that evaluating the arguments has side effects:
# get_version() may rewrite detectron2/__init__.py, get_model_zoo_configs() links or
# copies the configs folder into the package, and get_extensions() prepares the
# C++/CUDA extension that BuildExtension compiles.
setup(
    name="detectron2",
    version=get_version(),
    author="FAIR",
    url="https://github.com/facebookresearch/detectron2",
    description="Detectron2 is FAIR's next-generation research "
    "platform for object detection and segmentation.",
    packages=find_packages(exclude=("configs", "tests*")) + list(PROJECTS.keys()),
    package_dir=PROJECTS,
    package_data={"detectron2.model_zoo": get_model_zoo_configs()},
    python_requires=">=3.6",
    install_requires=[
        # Do not add opencv here. Just like pytorch, user should install
        # opencv themselves, preferably by OS's package manager, or by
        # choosing the proper pypi package name at https://github.com/skvark/opencv-python
        "termcolor>=1.1",
        "Pillow>=7.1",  # or use pillow-simd for better performance
        "yacs>=0.1.6",
        "tabulate",
        "cloudpickle",
        "matplotlib",
        "tqdm>4.29.0",
        "tensorboard",
        # Lock version of fvcore/iopath because they may have breaking changes
        # NOTE: when updating fvcore/iopath version, make sure fvcore depends
        # on compatible version of iopath.
        "fvcore>=0.1.5,<0.1.6",  # required like this to make it pip installable
        "iopath>=0.1.7,<0.1.9",
        "pycocotools>=2.0.2",  # corresponds to https://github.com/ppwwyyxx/cocoapi
        "future",  # used by caffe2
        "pydot",  # used to save caffe2 SVGs
        "dataclasses; python_version<'3.7'",
        "omegaconf>=2.1.0rc1",
        "hydra-core>=1.1.0rc1",
        "black==21.4b2",
        # When adding to the list, may need to update docs/requirements.txt
        # or add mock in docs/conf.py
    ],
    extras_require={
        # Optional extras, installed with e.g. `pip install detectron2[all]`.
        "all": [
            "shapely",
            "pygments>=2.2",
            "psutil",
            "panopticapi @ https://github.com/cocodataset/panopticapi/archive/master.zip",
        ],
        # Linting tools pinned for development.
        "dev": [
            "flake8==3.8.1",
            "isort==4.3.21",
            "flake8-bugbear",
            "flake8-comprehensions",
        ],
    },
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
test_transfer_learning.sh ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluate our trained open-vocabulary object detectors, {RN50, RN50x4} x {COCO, LVIS}
2
+
3
+ # RN50, COCO (Generalized: Novel + Base)
4
+ python3 ./tools/train_net.py \
5
+ --eval-only \
6
+ --num-gpus 1 \
7
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
8
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-coco_rn50.pth \
9
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
10
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
11
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
12
+ MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
13
+ MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
14
+
15
+ # # RN50, COCO (only Novel)
16
+ # # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml \
17
+ # # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_17_target_cls_emb.pth \
18
+
19
+ # # RN50, COCO (only Base)
20
+ # # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml \
21
+ # # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
22
+
23
+
24
+ # # RN50, LVIS
25
+ # python3 ./tools/train_net.py \
26
+ # --eval-only \
27
+ # --num-gpus 1 \
28
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml \
29
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-lvis_rn50.pth \
30
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
31
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
32
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb.pth \
33
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
34
+ # MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
35
+ # MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
36
+
37
+
38
+
39
+ # # RN50x4, COCO (Generalized: Novel + Base)
40
+ # python3 ./tools/train_net.py \
41
+ # --eval-only \
42
+ # --num-gpus 1 \
43
+ # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
44
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-coco_rn50x4.pth \
45
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
46
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
47
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb_rn50x4.pth \
48
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth \
49
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
50
+ # MODEL.RESNETS.DEPTH 200 \
51
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
52
+ # MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
53
+
54
+ # # RN50x4, COCO (only Novel)
55
+ # # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testt.yaml \
56
+ # # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_17_target_cls_emb_rn50x4.pth \
57
+
58
+ # # RN50x4, COCO (only Base)
59
+ # # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_testb.yaml \
60
+ # # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb_rn50x4.pth \
61
+
62
+
63
+ # # RN50x4, LVIS
64
+ # python3 ./tools/train_net.py \
65
+ # --eval-only \
66
+ # --num-gpus 1 \
67
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml \
68
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-lvis_rn50x4.pth \
69
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
70
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
71
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb_rn50x4.pth \
72
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
73
+ # MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
74
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
75
+ # MODEL.RESNETS.DEPTH 200 \
76
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
77
+ # MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \
78
+ # MODEL.RESNETS.RES2_OUT_CHANNELS 320 \
79
+ # MODEL.ROI_HEADS.SOFT_NMS_ENABLED True \
test_zeroshot_inference.sh ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # evaluate zero-shot inference {RN50, RN50x4} x {COCO, LVIS} x {GT, RPN}
2
+
3
+ # RN50, GT, COCO
4
+ python3 ./tools/train_net.py \
5
+ --eval-only \
6
+ --num-gpus 1 \
7
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml \
8
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
9
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
10
+ MODEL.CLIP.CROP_REGION_TYPE GT \
11
+ MODEL.CLIP.MULTIPLY_RPN_SCORE False \
12
+
13
+ # # RN50, RPN, COCO
14
+ # python3 ./tools/train_net.py \
15
+ # --eval-only \
16
+ # --num-gpus 1 \
17
+ # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml \
18
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
19
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
20
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
21
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
22
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
23
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
24
+
25
+
26
+ # # RN50, GT, LVIS
27
+ # python3 ./tools/train_net.py \
28
+ # --eval-only \
29
+ # --num-gpus 1 \
30
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
31
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
32
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
33
+ # MODEL.CLIP.CROP_REGION_TYPE GT \
34
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE False \
35
+ # MODEL.ROI_HEADS.SCORE_THRESH_TEST 0.0001 \
36
+
37
+ # # RN50, RPN, LVIS
38
+ # python3 ./tools/train_net.py \
39
+ # --eval-only \
40
+ # --num-gpus 1 \
41
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
42
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
43
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
44
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
45
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
46
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
47
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
48
+
49
+
50
+
51
+ # # RN50x4, GT, COCO
52
+ # python3 ./tools/train_net.py \
53
+ # --eval-only \
54
+ # --num-gpus 1 \
55
+ # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml \
56
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
57
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth \
58
+ # MODEL.CLIP.CROP_REGION_TYPE GT \
59
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE False \
60
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
61
+ # MODEL.RESNETS.DEPTH 200 \
62
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
63
+
64
+ # # RN50x4, RPN, COCO
65
+ # python3 ./tools/train_net.py \
66
+ # --eval-only \
67
+ # --num-gpus 1 \
68
+ # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd_zsinf.yaml \
69
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
70
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth \
71
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
72
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
73
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
74
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
75
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
76
+ # MODEL.RESNETS.DEPTH 200 \
77
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
78
+
79
+
80
+ # # RN50x4, GT, LVIS
81
+ # python3 ./tools/train_net.py \
82
+ # --eval-only \
83
+ # --num-gpus 1 \
84
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
85
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
86
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
87
+ # MODEL.CLIP.CROP_REGION_TYPE GT \
88
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE False \
89
+ # MODEL.ROI_HEADS.SCORE_THRESH_TEST 0.0001 \
90
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
91
+ # MODEL.RESNETS.DEPTH 200 \
92
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
93
+
94
+ # # RN50x4, RPN, LVIS
95
+ # python3 ./tools/train_net.py \
96
+ # --eval-only \
97
+ # --num-gpus 1 \
98
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_zsinf.yaml \
99
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
100
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
101
+ # MODEL.CLIP.CROP_REGION_TYPE RPN \
102
+ # MODEL.CLIP.MULTIPLY_RPN_SCORE True \
103
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
104
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866.pth \
105
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
106
+ # MODEL.RESNETS.DEPTH 200 \
107
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
train_transfer_learning.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # train open-vocabulary object detectors (initialized by our pretrained RegionCLIP), {RN50, RN50x4} x {COCO, LVIS}
2
+
3
+ # RN50, COCO
4
+ python3 ./tools/train_net.py \
5
+ --num-gpus 1 \
6
+ --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
7
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
8
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
9
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
10
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb.pth \
11
+ MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb.pth \
12
+
13
+ # # RN50, LVIS
14
+ # python3 ./tools/train_net.py \
15
+ # --num-gpus 1 \
16
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml \
17
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
18
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
19
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
20
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb.pth \
21
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
22
+ # MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
23
+
24
+
25
+
26
+ # # RN50x4, COCO
27
+ # python3 ./tools/train_net.py \
28
+ # --num-gpus 1 \
29
+ # --config-file ./configs/COCO-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_ovd.yaml \
30
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
31
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x_ovd_FSD.yaml \
32
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_coco_48.pth \
33
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_48_base_cls_emb_rn50x4.pth \
34
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/coco_65_cls_emb_rn50x4.pth \
35
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
36
+ # MODEL.RESNETS.DEPTH 200 \
37
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
38
+
39
+ # # RN50x4, LVIS
40
+ # python3 ./tools/train_net.py \
41
+ # --num-gpus 1 \
42
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4.yaml \
43
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
44
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
45
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
46
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_866_base_cls_emb_rn50x4.pth \
47
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
48
+ # MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
49
+ # MODEL.CLIP.TEXT_EMB_DIM 640 \
50
+ # MODEL.RESNETS.DEPTH 200 \
51
+ # MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
52
+ # MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \
53
+ # MODEL.RESNETS.RES2_OUT_CHANNELS 320 \
visualize_transfer_learning.sh ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # visualize detection results from finetuned detectors on custom images
2
+
3
+ ########################################################
4
+
5
+ # Open-vocabulary detector trained on 866 LVIS base categories, with RegionCLIP (RN50x4) as initialization
6
+ python3 ./tools/train_net.py \
7
+ --eval-only \
8
+ --num-gpus 1 \
9
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml \
10
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-lvis_rn50x4.pth \
11
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
12
+ MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
13
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
14
+ MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
15
+ MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
16
+ MODEL.CLIP.TEXT_EMB_DIM 640 \
17
+ MODEL.RESNETS.DEPTH 200 \
18
+ MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
19
+ MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION 18 \
20
+ MODEL.RESNETS.RES2_OUT_CHANNELS 320 \
21
+
22
+ # visualize the prediction json file
23
+ python ./tools/visualize_json_results.py \
24
+ --input ./output/inference/lvis_instances_results.json \
25
+ --output ./output/regions \
26
+ --dataset lvis_v1_val_custom_img \
27
+ --conf-threshold 0.05 \
28
+ --show-unique-boxes \
29
+ --max-boxes 25 \
30
+ --small-region-px 8100\
31
+
32
+
33
+ ########################################################
34
+
35
+ # Open-vocabulary detector trained on 866 LVIS base categories, with RegionCLIP (RN50) as initialization
36
+ # python3 ./tools/train_net.py \
37
+ # --eval-only \
38
+ # --num-gpus 1 \
39
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml \
40
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_finetuned-lvis_rn50.pth \
41
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
42
+ # MODEL.CLIP.OPENSET_TEST_TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
43
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
44
+ # MODEL.CLIP.BB_RPN_WEIGHTS ./pretrained_ckpt/rpn/rpn_lvis_866_lsj.pth \
45
+ # MODEL.CLIP.OFFLINE_RPN_LSJ_PRETRAINED True \
46
+
47
+ # # visualize the prediction json file
48
+ # python ./tools/visualize_json_results.py \
49
+ # --input ./output/inference/lvis_instances_results.json \
50
+ # --output ./output/regions \
51
+ # --dataset lvis_v1_val_custom_img \
52
+ # --conf-threshold 0.05 \
53
+ # --show-unique-boxes \
54
+ # --max-boxes 25 \
55
+ # --small-region-px 8100\
56
+
57
+
58
+
59
+
60
+
visualize_zeroshot_inference.sh ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # visualize zero-shot inference results on custom images
2
+
3
+ ########################################################
4
+
5
+ # RegionCLIP (RN50x4)
6
+ python3 ./tools/train_net.py \
7
+ --eval-only \
8
+ --num-gpus 1 \
9
+ --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml \
10
+ MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50x4.pth \
11
+ MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb_rn50x4.pth \
12
+ MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
13
+ MODEL.CLIP.TEXT_EMB_DIM 640 \
14
+ MODEL.RESNETS.DEPTH 200 \
15
+ MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION 18 \
16
+
17
+ # visualize the prediction json file
18
+ python ./tools/visualize_json_results.py \
19
+ --input ./output/inference/lvis_instances_results.json \
20
+ --output ./output/regions \
21
+ --dataset lvis_v1_val_custom_img \
22
+ --conf-threshold 0.05 \
23
+ --show-unique-boxes \
24
+ --max-boxes 25 \
25
+ --small-region-px 8100\
26
+
27
+
28
+ ########################################################
29
+
30
+ # RegionCLIP (RN50)
31
+ # python3 ./tools/train_net.py \
32
+ # --eval-only \
33
+ # --num-gpus 1 \
34
+ # --config-file ./configs/LVISv1-InstanceSegmentation/CLIP_fast_rcnn_R_50_C4_custom_img.yaml \
35
+ # MODEL.WEIGHTS ./pretrained_ckpt/regionclip/regionclip_pretrained-cc_rn50.pth \
36
+ # MODEL.CLIP.TEXT_EMB_PATH ./pretrained_ckpt/concept_emb/lvis_1203_cls_emb.pth \
37
+ # MODEL.CLIP.OFFLINE_RPN_CONFIG ./configs/LVISv1-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \
38
+
39
+ # # visualize the prediction json file
40
+ # python ./tools/visualize_json_results.py \
41
+ # --input ./output/inference/lvis_instances_results.json \
42
+ # --output ./output/regions \
43
+ # --dataset lvis_v1_val_custom_img \
44
+ # --conf-threshold 0.05 \
45
+ # --show-unique-boxes \
46
+ # --max-boxes 25 \
47
+ # --small-region-px 8100\
48
+
49
+
50
+
51
+
52
+