matanru commited on
Commit
93b49a4
·
0 Parent(s):

initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. .idea/workspace.xml +60 -0
  3. LICENSE +203 -0
  4. README.md +14 -0
  5. app.py +442 -0
  6. configs/1shot-swin-clip/base_split1_config.py +195 -0
  7. configs/1shot-swin-clip/base_split2_config.py +195 -0
  8. configs/1shot-swin-clip/base_split3_config.py +195 -0
  9. configs/1shot-swin-clip/base_split4_config.py +195 -0
  10. configs/1shot-swin-clip/base_split5_config.py +195 -0
  11. configs/1shot-swin-clip/graph_split1_config.py +198 -0
  12. configs/1shot-swin-clip/graph_split2_config.py +197 -0
  13. configs/1shot-swin-clip/graph_split3_config.py +197 -0
  14. configs/1shot-swin-clip/graph_split4_config.py +197 -0
  15. configs/1shot-swin-clip/graph_split5_config.py +197 -0
  16. configs/1shot-swin-gte/base_split1_config.py +195 -0
  17. configs/1shot-swin-gte/base_split2_config.py +195 -0
  18. configs/1shot-swin-gte/base_split3_config.py +195 -0
  19. configs/1shot-swin-gte/base_split4_config.py +195 -0
  20. configs/1shot-swin-gte/base_split5_config.py +195 -0
  21. configs/1shot-swin-gte/graph_split1_config.py +199 -0
  22. configs/1shot-swin-gte/graph_split2_config.py +197 -0
  23. configs/1shot-swin-gte/graph_split3_config.py +197 -0
  24. configs/1shot-swin-gte/graph_split4_config.py +197 -0
  25. configs/1shot-swin-gte/graph_split5_config.py +197 -0
  26. configs/_base_/datasets/ap10k.py +142 -0
  27. configs/_base_/default_runtime.py +20 -0
  28. configs/demo_b.py +191 -0
  29. demo_text.py +212 -0
  30. docker/Dockerfile +59 -0
  31. environment.yml +201 -0
  32. examples/animal.png +0 -0
  33. examples/car.png +0 -0
  34. examples/chair.png +0 -0
  35. examples/person.png +0 -0
  36. models/VERSION +1 -0
  37. models/__init__.py +3 -0
  38. models/__pycache__/__init__.cpython-38.pyc +0 -0
  39. models/apis/__init__.py +5 -0
  40. models/apis/__pycache__/__init__.cpython-38.pyc +0 -0
  41. models/apis/__pycache__/train.cpython-38.pyc +0 -0
  42. models/apis/train.py +126 -0
  43. models/core/__init__.py +1 -0
  44. models/core/__pycache__/__init__.cpython-38.pyc +0 -0
  45. models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc +0 -0
  46. models/core/custom_hooks/shuffle_hooks.py +29 -0
  47. models/datasets/__init__.py +3 -0
  48. models/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
  49. models/datasets/__pycache__/builder.cpython-38.pyc +0 -0
  50. models/datasets/builder.py +54 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.idea/workspace.xml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
8
+ <option name="SHOW_DIALOG" value="false" />
9
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
10
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
11
+ <option name="LAST_RESOLUTION" value="IGNORE" />
12
+ </component>
13
+ <component name="Git.Settings">
14
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
15
+ </component>
16
+ <component name="ProjectColorInfo"><![CDATA[{
17
+ "associatedIndex": 4
18
+ }]]></component>
19
+ <component name="ProjectId" id="2hKQwKx3zpbH4D8IcAn5ZJcn2HY" />
20
+ <component name="ProjectViewState">
21
+ <option name="hideEmptyMiddlePackages" value="true" />
22
+ <option name="showLibraryContents" value="true" />
23
+ </component>
24
+ <component name="PropertiesComponent"><![CDATA[{
25
+ "keyToString": {
26
+ "RunOnceActivity.ShowReadmeOnStart": "true",
27
+ "git-widget-placeholder": "main",
28
+ "last_opened_file_path": "/home/matanru/huggingface/CapeX",
29
+ "node.js.detected.package.eslint": "true",
30
+ "node.js.detected.package.tslint": "true",
31
+ "node.js.selected.package.eslint": "(autodetect)",
32
+ "node.js.selected.package.tslint": "(autodetect)",
33
+ "nodejs_package_manager_path": "npm",
34
+ "vue.rearranger.settings.migration": "true"
35
+ }
36
+ }]]></component>
37
+ <component name="SharedIndexes">
38
+ <attachedChunks>
39
+ <set>
40
+ <option value="bundled-js-predefined-1d06a55b98c1-cb551a44b0f8-JavaScript-PY-242.10180.30" />
41
+ <option value="bundled-python-sdk-7efad6460ed6-db4a76ca2eac-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-242.10180.30" />
42
+ </set>
43
+ </attachedChunks>
44
+ </component>
45
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
46
+ <component name="TaskManager">
47
+ <task active="true" id="Default" summary="Default task">
48
+ <changelist id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
49
+ <created>1717340527309</created>
50
+ <option name="number" value="Default" />
51
+ <option name="presentableId" value="Default" />
52
+ <updated>1717340527309</updated>
53
+ <workItem from="1717340528499" duration="17535000" />
54
+ </task>
55
+ <servers />
56
+ </component>
57
+ <component name="TypeScriptGeneratedFilesManager">
58
+ <option name="version" value="3" />
59
+ </component>
60
+ </project>
LICENSE ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 SenseTime. All Rights Reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright 2020 MMClassification Authors.
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CapeX
3
+ emoji: 👁
4
+ colorFrom: indigo
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.36.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ python: 3.10.13
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import argparse
3
+ import random
4
+ import os
5
+
6
+ os.system('python setup.py develop')
7
+
8
+ import gradio as gr
9
+ import matplotlib
10
+ import numpy as np
11
+ import torch
12
+ from PIL import ImageDraw, Image
13
+ from matplotlib import pyplot as plt
14
+ from mmcv import Config
15
+ import json
16
+
17
+ # def replace_line(file_name, line_num, text):
18
+ # lines = open(file_name, 'r').readlines()
19
+ # lines[line_num] = text
20
+ # out = open(file_name, 'w')
21
+ # out.writelines(lines)
22
+ # out.close()
23
+
24
+ # def read_lines(file_name):
25
+ # lines = open(file_name, 'r').readlines()
26
+ # print(lines)
27
+
28
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 7, "from mmengine import print_log\n")
29
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 8, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
30
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/registry.py", 3, 'from mmengine.registry import Registry\n')
31
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/fileio/io.py", 5, "from mmengine.utils import is_list_of\n")
32
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/checkpoint.py", 23, "from mmengine.utils import digit_version, mkdir_or_exist\nfrom mmengine.utils.dl_utils import load_url\n")
33
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/hook.py", 1, "from mmengine.registry import Registry\nfrom mmengine.utils import is_method_overridden\n")
34
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/evaluation.py",11, "from mmengine.utils import is_seq_of\n")
35
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/mlflow.py", 3, "from mmengine.utils.dl_utils import TORCH_VERSION\n")
36
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/tensorboard.py", 4, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
37
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/text.py", 12, "from mmengine.utils import is_tuple_of, scandir\n")
38
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/wandb.py", 5, "from mmengine.utils import scandir\n")
39
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 11, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm\n")
40
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 14, "from mmengine.utils import digit_version\n")
41
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/fp16_utils.py", 12, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
42
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/builder.py", 4, "from mmengine.registry import Registry\n")
43
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/builder.py", 7, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.registry import Registry, build_from_cfg\n")
44
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/default_constructor.py", 8, "from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm\nfrom mmengine.registry import build_from_cfg\nfrom mmengine.utils import is_list_of\n")
45
+
46
+ # def is_ipu_available() -> bool:
47
+ # try:
48
+ # import poptorch
49
+ # return poptorch.ipuHardwareIsAvailable()
50
+ # except ImportError:
51
+ # return False
52
+
53
+ # IS_IPU_AVAILABLE = str(is_ipu_available())
54
+
55
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/ipu/__init__.py", 1, f'IS_IPU_AVAILABLE = {IS_IPU_AVAILABLE}\n')
56
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/scatter_gather.py", 4, "from mmengine.utils import deprecated_api_warning\n")
57
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/_functions.py", 5, "from mmengine.utils import deprecated_api_warning\n")
58
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 1, "from mmengine.utils import digit_version\nfrom mmcv import parse_version_info\n")
59
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 21, "import mmcv\nmmcv_version = digit_version(mmcv.__version__)\n")
60
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/core/optimizers/builder.py", 3, "from mmengine.registry import Registry, build_from_cfg")
61
+
62
+
63
+ from mmcv.runner import load_checkpoint
64
+ from mmpose.core import wrap_fp16_model
65
+ from mmpose.models import build_posenet
66
+ from torchvision import transforms
67
+
68
+ from demo_text import Resize_Pad
69
+ from models import *
70
+
71
+ import networkx as nx
72
+ import matplotlib.pyplot as plt
73
+ import ast
74
+ import cv2
75
+
76
+ import matplotlib
77
+ # matplotlib.use('agg')
78
+
79
+ def edges_prompt_to_list(prompt):
80
+ if prompt[0] != "[":
81
+ prompt = "[" + prompt
82
+ if prompt[-1] != "]":
83
+ prompt += "]"
84
+ return ast.literal_eval(prompt)
85
+
86
+ def descriptions_prompt_to_list(prompt):
87
+ return prompt.split(',')
88
+
89
+
90
+ # Function to visualize the graph
91
+ def visualize_graph(node_descriptions, edges):
92
+ plt.close('all')
93
+ node_descriptions = descriptions_prompt_to_list(node_descriptions)
94
+ edges = edges_prompt_to_list(edges)
95
+
96
+ # Create an empty graph
97
+ G = nx.Graph()
98
+ G.clear()
99
+
100
+ # Add nodes with descriptions
101
+ for i, desc in enumerate(node_descriptions):
102
+ G.add_node(i, label=desc)
103
+
104
+ # Add edges
105
+ for edge in edges:
106
+ G.add_edge(edge[0], edge[1])
107
+
108
+ # Draw the graph
109
+ pos = nx.spring_layout(G) # Define layout
110
+ labels = nx.get_node_attributes(G, 'label') # Get labels
111
+ nx.draw(G, pos, with_labels=True, labels=labels, node_size=1500, node_color='skyblue', font_size=10, font_weight='bold', font_color='black') # Draw nodes with labels
112
+ nx.draw_networkx_edges(G, pos, width=2, edge_color='gray') # Draw edges
113
+ plt.title("Graph Visualization") # Set title
114
+ plt.axis('off') # Turn off axis
115
+ # plt.show() # Show plot
116
+ # Image from plot
117
+ fig = plt.gcf()
118
+ # fig.tight_layout(pad=0)
119
+
120
+ # To remove the huge white borders
121
+ # plt.margins(0)
122
+
123
+ fig.canvas.draw()
124
+ image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
125
+ image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,))
126
+ plt.clf()
127
+ return image_from_plot
128
+
129
+ checkpoint_path = ''
130
+
131
+
132
+
133
+ def plot_query_results(query_img, query_w, skeleton, prediction, radius=6):
134
+ h, w, c = query_img.shape
135
+ prediction = prediction[-1].cpu().numpy() * h
136
+ # prediction = prediction.cpu().numpy() * h
137
+ query_img = (query_img - np.min(query_img)) / (
138
+ np.max(query_img) - np.min(query_img))
139
+ for id, (img, w, keypoint) in enumerate(zip([query_img],
140
+ [query_w],
141
+ [prediction])):
142
+ f, axes = plt.subplots()
143
+ plt.imshow(img)
144
+ for k in range(keypoint.shape[0]):
145
+ if w[k] > 0:
146
+ kp = keypoint[k, :2]
147
+ c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6)
148
+ patch = plt.Circle(kp, radius, color=c)
149
+ axes.add_patch(patch)
150
+ axes.text(kp[0], kp[1], k)
151
+ plt.draw()
152
+ for l, limb in enumerate(skeleton):
153
+ kp = keypoint[:, :2]
154
+ if l > len(COLORS) - 1:
155
+ c = [x / 255 for x in random.sample(range(0, 255), 3)]
156
+ else:
157
+ c = [x / 255 for x in COLORS[l]]
158
+ if w[limb[0]] > 0 and w[limb[1]] > 0:
159
+ patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
160
+ [kp[limb[0], 1], kp[limb[1], 1]],
161
+ linewidth=6, color=c, alpha=0.6)
162
+ axes.add_artist(patch)
163
+ plt.axis('off') # command for hiding the axis.
164
+ plt.subplots_adjust(0, 0, 1, 1, 0, 0)
165
+ plt.margins(0)
166
+ fig = plt.gcf()
167
+ fig.tight_layout(pad=0)
168
+
169
+ return plt
170
+
171
+ COLORS = [
172
+ [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
173
+ [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
174
+ [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
175
+ [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]
176
+ ]
177
+
178
+ model = None
179
+
180
+ # @spaces.GPU(duration=30)
181
+ # def estimate(model, data):
182
+ # with torch.no_grad():
183
+ # model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
184
+ # data["img_q"].to(device=model_device)
185
+ # data['target_weight_s'][0].to(device=model_device)
186
+ # print(f'img type: {data["img_q"].dtype}, target_weight type: {data["target_weight_s"][0].dtype}')
187
+ # model.to(model_device)
188
+ # model.eval()
189
+ # # return model(**data)
190
+ # return model(str(data))
191
+
192
+ # @spaces.GPU(duration=30)
193
+ def estimate(data):
194
+ global model
195
+ with torch.no_grad():
196
+ # model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
197
+ # data["img_q"].to(device=model_device)
198
+ # data['target_weight_s'][0].to(device=model_device)
199
+
200
+ return model(data)
201
+
202
+
203
+ # Custom JSON encoder to handle non-serializable objects
204
+ class CustomEncoder(json.JSONEncoder):
205
+ def default(self, obj):
206
+ if isinstance(obj, np.ndarray):
207
+ return obj.tolist()
208
+ return super().default(obj)
209
+
210
+
211
def process(query_img, node_descriptions, edges,
            cfg_path='configs/1shot-swin-gte/graph_split1_config.py'):
    """Pose the query image from a text-graph prompt (CapeX inference).

    Parameters
    ----------
    query_img :
        Image from the Gradio UI (presumably a PIL image — it is passed
        through ``np.array``; confirm against the ``gr.Image`` component).
    node_descriptions : str
        Comma-separated keypoint descriptions; parsed with
        ``descriptions_prompt_to_list``.
    edges : str
        Textual list of 2-index pairs; parsed with ``edges_prompt_to_list``.
    cfg_path : str
        mmcv-style config file describing the model and preprocessing.

    Returns
    -------
    The figure produced by ``plot_query_results`` for the predicted
    keypoints.
    """
    # NOTE(review): the model is (re)built on every call; `global` only
    # keeps the last instance alive — confirm whether caching was intended.
    global model
    node_descriptions = descriptions_prompt_to_list(node_descriptions)
    edges = edges_prompt_to_list(edges)
    cfg = Config.fromfile(cfg_path)
    # Support keypoint coordinates are all-zero placeholders; presumably
    # only the text descriptions carry information in this text-graph mode.
    kp_src_tensor = torch.zeros((len(node_descriptions), 2))
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size,
                   cfg.model.encoder_config.img_size)])

    # The downstream pipeline expects at least one edge; use a self-loop.
    if len(edges) == 0:
        edges = [(0, 0)]

    #model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # RGB -> BGR via ::-1, then flip(0) after ToTensor reverses channels
    # again — net effect should be RGB order; TODO confirm.
    np_query = np.array(query_img)[:, :, ::-1].copy()
    q_img = preprocess(np_query).flip(0)[None] #.to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
                                       cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Append a zero z-column: (N, 2) keypoints -> (N, 3) "3d" joints.
    kp_src_3d = torch.cat(
        (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.cat(
        (torch.ones_like(kp_src_tensor),
         torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg,
                                                                 kp_src_3d,
                                                                 kp_src_3d_weight,
                                                                 sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    # All support targets are marked visible regardless of the generator's
    # output weights.
    target_weight_s = torch.ones_like(
        torch.tensor(target_weight_s).float()[None]) #.to(model_device)

    # Mimic the dataloader's batch structure for a single query with one
    # (dummy) support sample.
    data = {
        'img_s': [0],
        'img_q': q_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [edges],
                       'query_skeleton': edges,
                       # 'sample_point_descriptions': np.array([node_descriptions]),
                       'sample_point_descriptions': node_descriptions,
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src_tensor.mean(dim=0)],
                       'query_center': kp_src_tensor.mean(dim=0),
                       'sample_scale': [
                           kp_src_tensor.max(dim=0)[0] -
                           kp_src_tensor.min(dim=0)[0]],
                       'query_scale': kp_src_tensor.max(dim=0)[0] -
                                      kp_src_tensor.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }
    # Load model
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    # `checkpoint_path` is a module-level global set in the __main__ block.
    load_checkpoint(model, checkpoint_path, map_location='cpu')
    #model.to(model_device)
    #model.eval()

    # with torch.no_grad():
    #     outputs = model(**data)

    # Convert tensors to plain lists so the payload can be JSON-serialized
    # for the `estimate` call below.
    data["img_q"] = data["img_q"].cpu().numpy().tolist()
    data['target_weight_s'][0] = data['target_weight_s'][0].cpu().numpy().tolist()
    data['target_s'][0] = data['target_s'][0].cpu().numpy().tolist()

    data['img_metas'][0]['sample_joints_3d'][0] = data['img_metas'][0]['sample_joints_3d'][0].cpu().tolist()
    data['img_metas'][0]['query_joints_3d'] = data['img_metas'][0]['query_joints_3d'].cpu().tolist()
    data['img_metas'][0]['sample_center'][0] = data['img_metas'][0]['sample_center'][0].cpu().tolist()
    data['img_metas'][0]['query_center'] = data['img_metas'][0]['query_center'].cpu().tolist()
    data['img_metas'][0]['sample_scale'][0] = data['img_metas'][0]['sample_scale'][0].cpu().tolist()
    data['img_metas'][0]['query_scale'] = data['img_metas'][0]['query_scale'].cpu().tolist()

    # # data['img_metas'][0]['sample_point_descriptions'] = data['img_metas'][0]['sample_point_descriptions'].tolist()


    #model.cuda()
    model.eval()
    # return model(**data)
    # with torch.no_grad():
    #     outputs = model(**data)
    str_data = json.dumps(data, cls=CustomEncoder)

    # `estimate` runs the actual inference on the JSON payload; its result
    # is expected to contain a 'points' entry (see below).
    outputs = estimate(str_data)
    #outputs = estimate(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    out = plot_query_results(vis_q_image, vis_q_weight, edges, torch.tensor(outputs['points']).squeeze(0))
    return out
322
+
323
+
324
def update_examples(query_img, node_descriptions, edges):
    """Identity handler for ``gr.Examples``: echo the clicked example's
    values back so they populate the corresponding input components."""
    selection = (query_img, node_descriptions, edges)
    return selection
327
+
328
+
329
# Gradio UI: text-graph inputs (keypoint descriptions + edges), a live
# graph visualization, a query image, and an Evaluate button that runs
# `process`.
with gr.Blocks() as demo:
    # Per-session UI state.
    # NOTE(review): `state` is not referenced by any event handler below —
    # confirm it is still needed before removing.
    state = gr.State({
        'kp_src': [],
        'skeleton': [],
        'count': 0,
        'color_idx': 0,
        'prev_pt': None,
        'prev_pt_idx': None,
        'prev_clicked': None,
        'point_descriptions': None,
    })
    gr.Markdown('''
    # CapeX Demo
    We present a novel category agnostic pose estimation approach that utilizes support text-graphs
    (graphs with text on its nodes), instead of the conventional techniques that use support images.
    By leveraging the abstraction power of text-graphs, CapeX showcases SOTA results on MP100 while dropping the need
    of providing an annotated support image.
    ### [Paper](https://arxiv.org/pdf/2406.00384) | [GitHub](https://github.com/matanr/capex)
    ## Instructions
    1. Explain using text the desired keypoints. Please refer to the example for the right format.
    2. Optionally provide a graph representing the connections between the keypoints. Please refer to the example for the right format.
    3. Upload an image of the object you want to pose to the query image.
    4. Click **Evaluate** to pose the query image.
    ''')
    with gr.Row():
        # Input block for node descriptions
        node_descriptions = gr.Textbox(label="Node Descriptions (String separated by commas)", lines=5, type="text",
                                       value="left eye, nose, right eye"
                                       )

        # Input block for edges
        edges = gr.Textbox(label="Edges (List of 2-valued lists representing connections)", lines=5, type="text",
                           value="[[0,1], [1,2]]"
                           )

        def set_initial_text_graph():
            # Initial preview matching the default textbox values above.
            text_graph = visualize_graph("left eye, nose, right eye", "[[0,1], [1,2]]")
            return text_graph

        text_graph = gr.Image(label="Text-graph visualization",
                              value=set_initial_text_graph,
                              type="pil", height=400, width=400)

    with gr.Row():
        query_img = gr.Image(label="Query Image",
                             type="pil", height=400, width=400)
    with gr.Row():
        eval_btn = gr.Button(value="Evaluate")
    with gr.Row():
        output_img = gr.Plot(label="Output Image")
    with gr.Row():
        gr.Markdown("## Examples")
    with gr.Row():
        # Clicking an example fills the three inputs via `update_examples`.
        gr.Examples(
            examples=[
                ['examples/animal.png',
                 "left eye, right eye, nose, neck, root of tail, left shoulder, left elbow, "
                 "left front paw, right shoulder, right elbow, right front paw, left hip, "
                 "left knee, left back paw, right hip, right knee, right back paw",
                 "[[0, 1], [0, 2], [1, 2], [2, 3], [3, 4], [3, 5], [5, 6], [6, 7], [3, 8], [8, 9],"
                 "[9, 10], [4, 11], [11, 12], [12, 13], [4, 14], [14, 15], [15, 16]]"
                 ],
                ['examples/person.png',
                 "nose, left eye, right eye, left ear, right ear, left shoulder, right shoulder, left elbow, "
                 "right elbow, left wrist, right wrist, left hip, right hip, left knee, right knee, left ankle, "
                 "right ankle",
                 "[[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],"
                 "[6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]"
                 ],
                ['examples/chair.png',
                 "left and front leg, right and front leg, right and back leg, left and back leg, "
                 "left and front side of the seat, right and front side of the seat, right and back side of the seat, "
                 "left and back side of the seat, top left side of the backseat, top right side of the backseat",
                 "[[0, 4], [3, 7], [1, 5], [2, 6], [4, 5], [5, 6], [6, 7], [7, 4], [6, 7], [7, 8],[8, 9], [9, 6]]",
                 ],
                ['examples/car.png',
                 "front and right wheel, front and left wheel, rear and right wheel, rear and left wheel, "
                 "right headlight, left headlight, right taillight, left taillight, "
                 "front and right side of the top, front and left side of the top, rear and right side of the top, "
                 "rear and left side of the top",
                 "[[0, 2], [1, 3], [0, 1], [2, 3], [8, 10], [9, 11], [8, 9], [10, 11], [4, 0], "
                 "[4, 8], [4, 5], [5, 1], [5, 9], [6, 2], [6, 10], [7, 3], [7, 11], [6, 7]]"
                 ]
            ],
            inputs=[query_img, node_descriptions, edges],
            outputs=[query_img, node_descriptions, edges],
            fn=update_examples,
            run_on_click=True,
        )

    eval_btn.click(fn=process,
                   inputs=[query_img, node_descriptions, edges],
                   outputs=[output_img])

    # Keep the graph preview in sync with either textbox.
    node_descriptions.change(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])
    edges.input(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='CapeX Demo')
    parser.add_argument('--checkpoint',
                        help='checkpoint path',
                        default='swin-gte-split1.pth')
    args = parser.parse_args()
    # Module-level global consumed by `process` when loading the model.
    checkpoint_path = args.checkpoint
    demo.launch()
configs/1shot-swin-clip/base_split1_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 1.

SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Plain assignments only; loaded with
mmcv-style ``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split2_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 2.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split3_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 3.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split4_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 4.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split5_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=512,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split1_config.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ # total_epochs = 1
28
+ log_config = dict(
29
+ interval=50,
30
+ hooks=[
31
+ dict(type='TextLoggerHook'),
32
+ dict(type='TensorboardLoggerHook')
33
+ ])
34
+
35
+ channel_cfg = dict(
36
+ num_output_channels=1,
37
+ dataset_joints=1,
38
+ dataset_channel=[
39
+ [
40
+ 0,
41
+ ],
42
+ ],
43
+ inference_channel=[
44
+ 0,
45
+ ],
46
+ max_kpt_num=100)
47
+
48
+ # model settings
49
+ model = dict(
50
+ type='PoseAnythingModel',
51
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
52
+ text_pretrained="ViT-B/32",
53
+ finetune_text_pretrained=False,
54
+ encoder_config=dict(
55
+ type='SwinTransformerV2',
56
+ embed_dim=96,
57
+ depths=[2, 2, 18, 2],
58
+ num_heads=[3, 6, 12, 24],
59
+ window_size=16,
60
+ drop_path_rate=0.3,
61
+ img_size=256,
62
+ upsample="bilinear"
63
+ ),
64
+ keypoint_head=dict(
65
+ type='PoseHead',
66
+ img_in_channels=768,
67
+ # text_in_channels=768,
68
+ text_in_channels=512,
69
+ transformer=dict(
70
+ type='EncoderDecoder',
71
+ d_model=256,
72
+ nhead=8,
73
+ num_encoder_layers=3,
74
+ num_decoder_layers=3,
75
+ graph_decoder='pre',
76
+ dim_feedforward=768,
77
+ dropout=0.1,
78
+ similarity_proj_dim=256,
79
+ dynamic_proj_dim=128,
80
+ activation="relu",
81
+ normalize_before=False,
82
+ return_intermediate_dec=True),
83
+ share_kpt_branch=False,
84
+ num_decoder_layer=3,
85
+ with_heatmap_loss=True,
86
+
87
+ heatmap_loss_weight=2.0,
88
+ support_order_dropout=-1,
89
+ positional_encoding=dict(
90
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
91
+ # training and testing settings
92
+ train_cfg=dict(),
93
+ test_cfg=dict(
94
+ flip_test=False,
95
+ post_process='default',
96
+ shift_heatmap=True,
97
+ modulate_kernel=11))
98
+
99
+ data_cfg = dict(
100
+ image_size=[256, 256],
101
+ heatmap_size=[64, 64],
102
+ num_output_channels=channel_cfg['num_output_channels'],
103
+ num_joints=channel_cfg['dataset_joints'],
104
+ dataset_channel=channel_cfg['dataset_channel'],
105
+ inference_channel=channel_cfg['inference_channel'])
106
+
107
+ train_pipeline = [
108
+ dict(type='LoadImageFromFile'),
109
+ dict(
110
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
111
+ scale_factor=0.15),
112
+ dict(type='TopDownAffineFewShot'),
113
+ dict(type='ToTensor'),
114
+ dict(
115
+ type='NormalizeTensor',
116
+ mean=[0.485, 0.456, 0.406],
117
+ std=[0.229, 0.224, 0.225]),
118
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
119
+ dict(
120
+ type='Collect',
121
+ keys=['img', 'target', 'target_weight'],
122
+ meta_keys=[
123
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
124
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
125
+ ]),
126
+ ]
127
+
128
+ valid_pipeline = [
129
+ dict(type='LoadImageFromFile'),
130
+ dict(type='TopDownAffineFewShot'),
131
+ dict(type='ToTensor'),
132
+ dict(
133
+ type='NormalizeTensor',
134
+ mean=[0.485, 0.456, 0.406],
135
+ std=[0.229, 0.224, 0.225]),
136
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
137
+ dict(
138
+ type='Collect',
139
+ keys=['img', 'target', 'target_weight'],
140
+ meta_keys=[
141
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
142
+ 'flip_pairs', 'category_id',
143
+ 'skeleton',
144
+ ]),
145
+ ]
146
+
147
+ test_pipeline = valid_pipeline
148
+
149
+ data_root = 'data/mp100'
150
+ data = dict(
151
+ samples_per_gpu=16,
152
+ workers_per_gpu=16,
153
+ # samples_per_gpu=8,
154
+ # workers_per_gpu=8,
155
+ train=dict(
156
+ type='TransformerPoseDataset',
157
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
158
+ img_prefix=f'{data_root}/images/',
159
+ # img_prefix=f'{data_root}',
160
+ data_cfg=data_cfg,
161
+ valid_class_ids=None,
162
+ max_kpt_num=channel_cfg['max_kpt_num'],
163
+ num_shots=1,
164
+ pipeline=train_pipeline),
165
+ val=dict(
166
+ type='TransformerPoseDataset',
167
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
168
+ img_prefix=f'{data_root}/images/',
169
+ # img_prefix=f'{data_root}',
170
+ data_cfg=data_cfg,
171
+ valid_class_ids=None,
172
+ max_kpt_num=channel_cfg['max_kpt_num'],
173
+ num_shots=1,
174
+ num_queries=15,
175
+ num_episodes=100,
176
+ pipeline=valid_pipeline),
177
+ test=dict(
178
+ type='TestPoseDataset',
179
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
180
+ img_prefix=f'{data_root}/images/',
181
+ # img_prefix=f'{data_root}',
182
+ data_cfg=data_cfg,
183
+ valid_class_ids=None,
184
+ max_kpt_num=channel_cfg['max_kpt_num'],
185
+ num_shots=1,
186
+ num_queries=15,
187
+ num_episodes=200,
188
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
189
+ pipeline=test_pipeline),
190
+ )
191
+ vis_backends = [
192
+ dict(type='LocalVisBackend'),
193
+ dict(type='TensorboardVisBackend'),
194
+ ]
195
+ visualizer = dict(
196
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
197
+
198
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split2_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split3_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split4_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split5_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split1_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split2_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split3_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split4_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split5_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split1_config.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ # total_epochs = 1
28
+ log_config = dict(
29
+ interval=50,
30
+ hooks=[
31
+ dict(type='TextLoggerHook'),
32
+ dict(type='TensorboardLoggerHook')
33
+ ])
34
+
35
+ channel_cfg = dict(
36
+ num_output_channels=1,
37
+ dataset_joints=1,
38
+ dataset_channel=[
39
+ [
40
+ 0,
41
+ ],
42
+ ],
43
+ inference_channel=[
44
+ 0,
45
+ ],
46
+ max_kpt_num=100)
47
+
48
+ # model settings
49
+ model = dict(
50
+ type='PoseAnythingModel',
51
+ pretrained="swinv2_base",
52
+ #'pretrained/swinv2_small_1k_500k.pth',
53
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
54
+ finetune_text_pretrained=False,
55
+ encoder_config=dict(
56
+ type='SwinTransformerV2',
57
+ embed_dim=96,
58
+ depths=[2, 2, 18, 2],
59
+ num_heads=[3, 6, 12, 24],
60
+ window_size=16,
61
+ drop_path_rate=0.3,
62
+ img_size=256,
63
+ upsample="bilinear"
64
+ ),
65
+ keypoint_head=dict(
66
+ type='PoseHead',
67
+ img_in_channels=768,
68
+ text_in_channels=768,
69
+ # text_in_channels=512,
70
+ transformer=dict(
71
+ type='EncoderDecoder',
72
+ d_model=256,
73
+ nhead=8,
74
+ num_encoder_layers=3,
75
+ num_decoder_layers=3,
76
+ graph_decoder='pre',
77
+ dim_feedforward=768,
78
+ dropout=0.1,
79
+ similarity_proj_dim=256,
80
+ dynamic_proj_dim=128,
81
+ activation="relu",
82
+ normalize_before=False,
83
+ return_intermediate_dec=True),
84
+ share_kpt_branch=False,
85
+ num_decoder_layer=3,
86
+ with_heatmap_loss=True,
87
+
88
+ heatmap_loss_weight=2.0,
89
+ support_order_dropout=-1,
90
+ positional_encoding=dict(
91
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
92
+ # training and testing settings
93
+ train_cfg=dict(),
94
+ test_cfg=dict(
95
+ flip_test=False,
96
+ post_process='default',
97
+ shift_heatmap=True,
98
+ modulate_kernel=11))
99
+
100
+ data_cfg = dict(
101
+ image_size=[256, 256],
102
+ heatmap_size=[64, 64],
103
+ num_output_channels=channel_cfg['num_output_channels'],
104
+ num_joints=channel_cfg['dataset_joints'],
105
+ dataset_channel=channel_cfg['dataset_channel'],
106
+ inference_channel=channel_cfg['inference_channel'])
107
+
108
+ train_pipeline = [
109
+ dict(type='LoadImageFromFile'),
110
+ dict(
111
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
112
+ scale_factor=0.15),
113
+ dict(type='TopDownAffineFewShot'),
114
+ dict(type='ToTensor'),
115
+ dict(
116
+ type='NormalizeTensor',
117
+ mean=[0.485, 0.456, 0.406],
118
+ std=[0.229, 0.224, 0.225]),
119
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
120
+ dict(
121
+ type='Collect',
122
+ keys=['img', 'target', 'target_weight'],
123
+ meta_keys=[
124
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
125
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
126
+ ]),
127
+ ]
128
+
129
+ valid_pipeline = [
130
+ dict(type='LoadImageFromFile'),
131
+ dict(type='TopDownAffineFewShot'),
132
+ dict(type='ToTensor'),
133
+ dict(
134
+ type='NormalizeTensor',
135
+ mean=[0.485, 0.456, 0.406],
136
+ std=[0.229, 0.224, 0.225]),
137
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
138
+ dict(
139
+ type='Collect',
140
+ keys=['img', 'target', 'target_weight'],
141
+ meta_keys=[
142
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
143
+ 'flip_pairs', 'category_id',
144
+ 'skeleton',
145
+ ]),
146
+ ]
147
+
148
+ test_pipeline = valid_pipeline
149
+
150
+ data_root = 'data/mp100'
151
+ data = dict(
152
+ samples_per_gpu=16,
153
+ workers_per_gpu=16,
154
+ # samples_per_gpu=8,
155
+ # workers_per_gpu=8,
156
+ train=dict(
157
+ type='TransformerPoseDataset',
158
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
159
+ img_prefix=f'{data_root}/images/',
160
+ # img_prefix=f'{data_root}',
161
+ data_cfg=data_cfg,
162
+ valid_class_ids=None,
163
+ max_kpt_num=channel_cfg['max_kpt_num'],
164
+ num_shots=1,
165
+ pipeline=train_pipeline),
166
+ val=dict(
167
+ type='TransformerPoseDataset',
168
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
169
+ img_prefix=f'{data_root}/images/',
170
+ # img_prefix=f'{data_root}',
171
+ data_cfg=data_cfg,
172
+ valid_class_ids=None,
173
+ max_kpt_num=channel_cfg['max_kpt_num'],
174
+ num_shots=1,
175
+ num_queries=15,
176
+ num_episodes=100,
177
+ pipeline=valid_pipeline),
178
+ test=dict(
179
+ type='TestPoseDataset',
180
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
181
+ img_prefix=f'{data_root}/images/',
182
+ # img_prefix=f'{data_root}',
183
+ data_cfg=data_cfg,
184
+ valid_class_ids=None,
185
+ max_kpt_num=channel_cfg['max_kpt_num'],
186
+ num_shots=1,
187
+ num_queries=15,
188
+ num_episodes=200,
189
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
190
+ pipeline=test_pipeline),
191
+ )
192
+ vis_backends = [
193
+ dict(type='LocalVisBackend'),
194
+ dict(type='TensorboardVisBackend'),
195
+ ]
196
+ visualizer = dict(
197
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
198
+
199
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split2_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split3_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split4_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split5_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/_base_/datasets/ap10k.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_info = dict(
2
+ dataset_name='ap10k',
3
+ paper_info=dict(
4
+ author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
5
+ 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
6
+ title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
7
+ container='35th Conference on Neural Information Processing Systems '
8
+ '(NeurIPS 2021) Track on Datasets and Bench-marks.',
9
+ year='2021',
10
+ homepage='https://github.com/AlexTheBad/AP-10K',
11
+ ),
12
+ keypoint_info={
13
+ 0:
14
+ dict(
15
+ name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
16
+ 1:
17
+ dict(
18
+ name='R_Eye',
19
+ id=1,
20
+ color=[255, 128, 0],
21
+ type='upper',
22
+ swap='L_Eye'),
23
+ 2:
24
+ dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
25
+ 3:
26
+ dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
27
+ 4:
28
+ dict(
29
+ name='Root of tail',
30
+ id=4,
31
+ color=[51, 153, 255],
32
+ type='lower',
33
+ swap=''),
34
+ 5:
35
+ dict(
36
+ name='L_Shoulder',
37
+ id=5,
38
+ color=[51, 153, 255],
39
+ type='upper',
40
+ swap='R_Shoulder'),
41
+ 6:
42
+ dict(
43
+ name='L_Elbow',
44
+ id=6,
45
+ color=[51, 153, 255],
46
+ type='upper',
47
+ swap='R_Elbow'),
48
+ 7:
49
+ dict(
50
+ name='L_F_Paw',
51
+ id=7,
52
+ color=[0, 255, 0],
53
+ type='upper',
54
+ swap='R_F_Paw'),
55
+ 8:
56
+ dict(
57
+ name='R_Shoulder',
58
+ id=8,
59
+ color=[0, 255, 0],
60
+ type='upper',
61
+ swap='L_Shoulder'),
62
+ 9:
63
+ dict(
64
+ name='R_Elbow',
65
+ id=9,
66
+ color=[255, 128, 0],
67
+ type='upper',
68
+ swap='L_Elbow'),
69
+ 10:
70
+ dict(
71
+ name='R_F_Paw',
72
+ id=10,
73
+ color=[0, 255, 0],
74
+ type='lower',
75
+ swap='L_F_Paw'),
76
+ 11:
77
+ dict(
78
+ name='L_Hip',
79
+ id=11,
80
+ color=[255, 128, 0],
81
+ type='lower',
82
+ swap='R_Hip'),
83
+ 12:
84
+ dict(
85
+ name='L_Knee',
86
+ id=12,
87
+ color=[255, 128, 0],
88
+ type='lower',
89
+ swap='R_Knee'),
90
+ 13:
91
+ dict(
92
+ name='L_B_Paw',
93
+ id=13,
94
+ color=[0, 255, 0],
95
+ type='lower',
96
+ swap='R_B_Paw'),
97
+ 14:
98
+ dict(
99
+ name='R_Hip', id=14, color=[0, 255, 0], type='lower',
100
+ swap='L_Hip'),
101
+ 15:
102
+ dict(
103
+ name='R_Knee',
104
+ id=15,
105
+ color=[0, 255, 0],
106
+ type='lower',
107
+ swap='L_Knee'),
108
+ 16:
109
+ dict(
110
+ name='R_B_Paw',
111
+ id=16,
112
+ color=[0, 255, 0],
113
+ type='lower',
114
+ swap='L_B_Paw'),
115
+ },
116
+ skeleton_info={
117
+ 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
118
+ 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
119
+ 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
120
+ 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
121
+ 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
122
+ 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
123
+ 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
124
+ 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]),
125
+ 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]),
126
+ 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]),
127
+ 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]),
128
+ 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]),
129
+ 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]),
130
+ 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]),
131
+ 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]),
132
+ 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]),
133
+ 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]),
134
+ },
135
+ joint_weights=[
136
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
137
+ 1.5
138
+ ],
139
+ sigmas=[
140
+ 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
141
+ 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
142
+ ])
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_config = dict(interval=10)
2
+
3
+ log_config = dict(
4
+ interval=50,
5
+ hooks=[
6
+ dict(type='TextLoggerHook'),
7
+ # dict(type='TensorboardLoggerHook')
8
+ # dict(type='PaviLoggerHook') # for internal services
9
+ ])
10
+
11
+ log_level = 'INFO'
12
+ load_from = None
13
+ resume_from = None
14
+ dist_params = dict(backend='nccl')
15
+ workflow = [('train', 1)]
16
+
17
+ # disable opencv multithreading to avoid system being overloaded
18
+ opencv_num_threads = 0
19
+ # set multi-process start method as `fork` to speed up the training
20
+ mp_start_method = 'fork'
configs/demo_b.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='swinv2_small',
51
+ encoder_config=dict(
52
+ type='SwinTransformerV2',
53
+ embed_dim=96,
54
+ depths=[2, 2, 18, 2],
55
+ num_heads=[3, 6, 12, 24],
56
+ window_size=16,
57
+ drop_path_rate=0.3,
58
+ img_size=256,
59
+ upsample="bilinear"
60
+ ),
61
+ keypoint_head=dict(
62
+ type='PoseHead',
63
+ in_channels=768,
64
+ transformer=dict(
65
+ type='EncoderDecoder',
66
+ d_model=256,
67
+ nhead=8,
68
+ num_encoder_layers=3,
69
+ num_decoder_layers=3,
70
+ graph_decoder='pre',
71
+ dim_feedforward=768,
72
+ dropout=0.1,
73
+ similarity_proj_dim=256,
74
+ dynamic_proj_dim=128,
75
+ activation="relu",
76
+ normalize_before=False,
77
+ return_intermediate_dec=True),
78
+ share_kpt_branch=False,
79
+ num_decoder_layer=3,
80
+ with_heatmap_loss=True,
81
+
82
+ heatmap_loss_weight=2.0,
83
+ support_order_dropout=-1,
84
+ positional_encoding=dict(
85
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
86
+ # training and testing settings
87
+ train_cfg=dict(),
88
+ test_cfg=dict(
89
+ flip_test=False,
90
+ post_process='default',
91
+ shift_heatmap=True,
92
+ modulate_kernel=11))
93
+
94
+ data_cfg = dict(
95
+ image_size=[256, 256],
96
+ heatmap_size=[64, 64],
97
+ num_output_channels=channel_cfg['num_output_channels'],
98
+ num_joints=channel_cfg['dataset_joints'],
99
+ dataset_channel=channel_cfg['dataset_channel'],
100
+ inference_channel=channel_cfg['inference_channel'])
101
+
102
+ train_pipeline = [
103
+ dict(type='LoadImageFromFile'),
104
+ dict(
105
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
106
+ scale_factor=0.15),
107
+ dict(type='TopDownAffineFewShot'),
108
+ dict(type='ToTensor'),
109
+ dict(
110
+ type='NormalizeTensor',
111
+ mean=[0.485, 0.456, 0.406],
112
+ std=[0.229, 0.224, 0.225]),
113
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
114
+ dict(
115
+ type='Collect',
116
+ keys=['img', 'target', 'target_weight'],
117
+ meta_keys=[
118
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
119
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
120
+ ]),
121
+ ]
122
+
123
+ valid_pipeline = [
124
+ dict(type='LoadImageFromFile'),
125
+ dict(type='TopDownAffineFewShot'),
126
+ dict(type='ToTensor'),
127
+ dict(
128
+ type='NormalizeTensor',
129
+ mean=[0.485, 0.456, 0.406],
130
+ std=[0.229, 0.224, 0.225]),
131
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
132
+ dict(
133
+ type='Collect',
134
+ keys=['img', 'target', 'target_weight'],
135
+ meta_keys=[
136
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
137
+ 'flip_pairs', 'category_id',
138
+ 'skeleton',
139
+ ]),
140
+ ]
141
+
142
+ test_pipeline = valid_pipeline
143
+
144
+ data_root = 'data/mp100'
145
+ data = dict(
146
+ samples_per_gpu=8,
147
+ workers_per_gpu=8,
148
+ train=dict(
149
+ type='TransformerPoseDataset',
150
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
151
+ img_prefix=f'{data_root}/images/',
152
+ # img_prefix=f'{data_root}',
153
+ data_cfg=data_cfg,
154
+ valid_class_ids=None,
155
+ max_kpt_num=channel_cfg['max_kpt_num'],
156
+ num_shots=1,
157
+ pipeline=train_pipeline),
158
+ val=dict(
159
+ type='TransformerPoseDataset',
160
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
161
+ img_prefix=f'{data_root}/images/',
162
+ # img_prefix=f'{data_root}',
163
+ data_cfg=data_cfg,
164
+ valid_class_ids=None,
165
+ max_kpt_num=channel_cfg['max_kpt_num'],
166
+ num_shots=1,
167
+ num_queries=15,
168
+ num_episodes=100,
169
+ pipeline=valid_pipeline),
170
+ test=dict(
171
+ type='TestPoseDataset',
172
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
173
+ img_prefix=f'{data_root}/images/',
174
+ # img_prefix=f'{data_root}',
175
+ data_cfg=data_cfg,
176
+ valid_class_ids=None,
177
+ max_kpt_num=channel_cfg['max_kpt_num'],
178
+ num_shots=1,
179
+ num_queries=15,
180
+ num_episodes=200,
181
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
182
+ pipeline=test_pipeline),
183
+ )
184
+ vis_backends = [
185
+ dict(type='LocalVisBackend'),
186
+ dict(type='TensorboardVisBackend'),
187
+ ]
188
+ visualizer = dict(
189
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
190
+
191
+ shuffle_cfg = dict(interval=1)
demo_text.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import os
4
+ import pickle
5
+ import random
6
+ import cv2
7
+ import numpy as np
8
+ import string
9
+ import torch
10
+ from mmcv import Config, DictAction
11
+ from mmcv.cnn import fuse_conv_bn
12
+ from mmcv.runner import load_checkpoint
13
+ from mmpose.core import wrap_fp16_model
14
+ from mmpose.models import build_posenet
15
+ from torchvision import transforms
16
+ from models import *
17
+ import torchvision.transforms.functional as F
18
+
19
+ from tools.visualization import plot_results, plot_query_results, plot_modified_query
20
+ import ast
21
+ import shutil
22
+
23
# Color palette (one triplet per keypoint) used for visualization.
# Sweeps the hue wheel; note the first and last entries are both pure red.
# NOTE(review): channel order (RGB vs cv2's BGR) depends on the plotting code — confirm.
COLORS = [
    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
28
+
29
class Resize_Pad:
    """Zero-pad a CHW image tensor to a square, then resize to (h, w).

    The shorter spatial dimension is padded evenly on both sides so the
    content stays centered; near-square inputs (aspect ratio ~1 after
    rounding to 2 decimals) skip the padding step.
    """

    def __init__(self, w=256, h=256):
        self.w = w
        self.h = h

    def __call__(self, image):
        _, dim0, dim1 = image.shape
        aspect = dim0 / dim1
        # Near-square already: resize directly without padding.
        if round(aspect, 2) == 1:
            return F.resize(image, [self.h, self.w])
        if aspect > 1:
            # dim0 is larger: pad dim1 evenly on both sides.
            half_pad = int(dim0 - dim1) // 2
            padded = F.pad(image, (half_pad, 0, half_pad, 0), 0, "constant")
        else:
            # dim1 is larger: pad dim0 evenly on both sides.
            half_pad = int(dim1 - dim0) // 2
            padded = F.pad(image, (0, half_pad, 0, half_pad), 0, "constant")
        return F.resize(padded, [self.h, self.w])
52
+
53
+
54
def transform_keypoints_to_pad_and_resize(keypoints, image_size):
    """Map keypoints from original image coordinates to the 256x256 pad-and-resize frame.

    Mirrors Resize_Pad: the shorter image side is symmetrically zero-padded
    to a square, and the square is resized to 256x256, so keypoints are
    shifted by half the pad and scaled by 256 / longer_side.

    Args:
        keypoints (torch.Tensor): (K, 2) tensor of (x, y) coordinates.
        image_size: sequence whose first two entries are (h, w) of the
            original image.

    Returns:
        torch.Tensor: transformed copy of ``keypoints`` (input not mutated).
    """
    trans_keypoints = keypoints.clone()
    h, w = image_size[:2]
    ratio_1 = w / h
    if ratio_1 > 1:
        # Wider than tall: height is padded, so shift y by half the pad.
        hp = int(w - h) // 2
        trans_keypoints[:, 1] = keypoints[:, 1] + hp
        trans_keypoints *= (256. / w)
    else:
        # Taller than (or equal to) wide: width is padded, so shift x by half the pad.
        # BUG FIX: pad was computed as w - h (negative in this branch); it must be h - w.
        wp = int(h - w) // 2
        trans_keypoints[:, 0] = keypoints[:, 0] + wp
        trans_keypoints *= (256. / h)
    return trans_keypoints
71
+
72
+
73
def parse_args():
    """Parse command-line arguments for the text-prompted Pose Anything demo.

    Returns:
        argparse.Namespace: parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Pose Anything Demo')
    parser.add_argument('--support_points', help='support keypoints text descriptions')
    parser.add_argument('--support_skeleton', help='list of keypoints skeleton')
    parser.add_argument('--query', help='Image file')
    parser.add_argument('--config', default=None, help='test config file path')
    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
    # BUG FIX: --outdir help previously read 'checkpoint file' (copy-paste error).
    parser.add_argument('--outdir', default='output', help='output directory')

    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase'
             'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. For example, '
             "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    args = parser.parse_args()
    return args
97
+
98
+
99
def merge_configs(cfg1, cfg2):
    """Merge ``cfg2`` into ``cfg1`` and return the result.

    On key collision, ``cfg2`` wins; entries of ``cfg2`` whose value is None
    are ignored. Either argument may be None (treated as empty). ``cfg1`` is
    copied, so neither input is mutated.

    Args:
        cfg1 (dict | None): base configuration.
        cfg2 (dict | None): overriding configuration.

    Returns:
        dict: merged configuration.
    """
    merged = {} if cfg1 is None else cfg1.copy()
    cfg2 = {} if cfg2 is None else cfg2
    for key, value in cfg2.items():
        # BUG FIX: was `if value:` which also dropped legitimate falsy values
        # such as 0, False and empty containers; the documented intent is to
        # skip only None.
        if value is not None:
            merged[key] = value
    return merged
108
+
109
+
110
def main():
    """Run single-image inference for the text-prompted pose model.

    Reads CLI args (see parse_args), builds placeholder support inputs from
    the textual keypoint descriptions, runs the model on the query image and
    writes a visualization plus a copy of the input into ``args.outdir``.
    """
    # Fix all RNG seeds for reproducible inference.
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    args = parse_args()
    cfg = Config.fromfile(args.config)

    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    os.makedirs(args.outdir, exist_ok=True)

    # Load data: keypoint descriptions are passed as a Python-literal string.
    point_descriptions = ast.literal_eval(args.support_points)
    query_img = cv2.imread(args.query)
    if query_img is None:
        raise ValueError('Fail to read image')

    # just a placeholder, we don't have input keypoints
    kp_src = torch.zeros((len(point_descriptions), 2))

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])

    # NOTE(review): `skeleton` is only assigned when --support_skeleton is
    # given; omitting the flag raises NameError further down — confirm the
    # argument is effectively required.
    if args.support_skeleton is not None:
        skeleton = ast.literal_eval(args.support_skeleton)
        if len(skeleton) == 0:
            skeleton = [(0, 0)]

    model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # .flip(0) reverses the channel axis (cv2 reads BGR; model presumably expects RGB).
    query_img = preprocess(query_img).flip(0)[None].to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Append a zero third column: (x, y) -> (x, y, 0) as expected by the target generator.
    kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1)

    # everything that is related to the support image is used as placeholder
    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    target_weight_s = torch.tensor(target_weight_s).float()[None].to(model_device)

    # Assemble the model's forward kwargs; support-side entries are dummies
    # since only text descriptions (not a support image) drive this demo.
    data = {
        'img_s': [0],
        'img_q': query_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [skeleton],
                       'query_skeleton': skeleton,
                       'sample_point_descriptions': np.array([point_descriptions]),
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src.mean(dim=0)],
                       'query_center': kp_src.mean(dim=0),
                       'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]],
                       'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }

    # Load model
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    model.to(model_device)
    model.eval()

    with torch.no_grad():
        outputs = model(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    # CHW tensor -> HWC numpy array for plotting.
    vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    name_idx = plot_query_results(vis_q_image, vis_q_weight, skeleton, torch.tensor(outputs['points']).squeeze(0), out_dir=args.outdir)
    # Keep a copy of the raw input next to the rendered output.
    shutil.copyfile(args.query, f'./{args.outdir}/{str(name_idx)}_query_in.png')


if __name__ == '__main__':
    main()
docker/Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base-image version pins (overridable at build time with --build-arg).
ARG PYTORCH="2.0.1"
ARG CUDA="11.7"
ARG CUDNN="8"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

# Target CUDA architectures for any compiled extensions.
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# Non-interactive apt so the build never blocks on prompts.
ENV TZ=Asia/Kolkata DEBIAN_FRONTEND=noninteractive
# To fix GPG key error when running apt-get update
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# System libraries required by OpenCV/mmcv (GL, X11) plus git and ninja for source builds.
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Install xtcocotools (needs cython available first)
RUN pip install cython
RUN pip install xtcocotools
# Install MMEngine and MMCV via openmim
RUN pip install openmim
RUN mim install mmengine
RUN mim install "mmpose==0.28.1"
RUN mim install "mmcv-full==1.5.3"
RUN pip install -U torchmetrics timm
RUN pip install numpy scipy --upgrade
RUN pip install future tensorboard

# Some other requirements: CLIP backbone, yapf pinned for mmcv compatibility, HF transformers
RUN pip install git+https://github.com/openai/CLIP.git
RUN pip install yapf==0.40.1
RUN pip install transformers

# NOTE(review): `WORKDIR CapeX` here combined with the `CapeX/...` COPY
# destinations and the second `WORKDIR CapeX` below yields a nested
# CapeX/CapeX layout — confirm this is intentional.
WORKDIR CapeX

COPY models CapeX/models
COPY configs CapeX/configs
COPY pretrained CapeX/pretrained
COPY requirements.txt CapeX/
COPY tools CapeX/tools
COPY setup.cfg CapeX/
COPY setup.py CapeX/
COPY test.py CapeX/
COPY train.py CapeX/
COPY README.md CapeX/
COPY run_me.sh CapeX/

# Placeholder for the MP-100 dataset (expected to be mounted or copied in later).
RUN mkdir -p CapeX/data/mp100
WORKDIR CapeX

# Install MMPose
RUN conda clean --all
ENV FORCE_CUDA="1"
RUN python setup.py develop

#CMD ["bash"]
#CMD ["/bin/bash", "-c", "chmod +x run_me.sh && ./run_me.sh"]
environment.yml ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: capex
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1
8
+ - _openmp_mutex=5.1
9
+ - blas=1.0
10
+ - ca-certificates=2024.3.11
11
+ - cuda-cudart=12.1.105
12
+ - cuda-cupti=12.1.105
13
+ - cuda-libraries=12.1.0
14
+ - cuda-nvrtc=12.1.105
15
+ - cuda-nvtx=12.1.105
16
+ - cuda-opencl=12.4.99
17
+ - cuda-runtime=12.1.0
18
+ - cudatoolkit=11.8.0
19
+ - filelock=3.13.1
20
+ - gmp=6.2.1
21
+ - gmpy2=2.1.2
22
+ - intel-openmp=2023.1.0
23
+ - jinja2=3.1.3
24
+ - ld_impl_linux-64=2.38
25
+ - libcublas=12.1.0.26
26
+ - libcufft=11.0.2.4
27
+ - libcufile=1.9.0.20
28
+ - libcurand=10.3.5.119
29
+ - libcusolver=11.4.4.55
30
+ - libcusparse=12.0.2.55
31
+ - libffi=3.4.4
32
+ - libgcc-ng=11.2.0
33
+ - libgomp=11.2.0
34
+ - libnpp=12.0.2.50
35
+ - libnvjitlink=12.1.105
36
+ - libnvjpeg=12.1.1.14
37
+ - libstdcxx-ng=11.2.0
38
+ - markupsafe=2.1.3
39
+ - mkl=2023.1.0
40
+ - mpc=1.1.0
41
+ - mpfr=4.0.2
42
+ - mpmath=1.3.0
43
+ - ncurses=6.4
44
+ - networkx=3.1
45
+ - openssl=3.0.13
46
+ - pip=23.3.1
47
+ - python=3.8.18
48
+ - pytorch-cuda=12.1
49
+ - pytorch-mutex=1.0
50
+ - readline=8.2
51
+ - setuptools=68.2.2
52
+ - sqlite=3.41.2
53
+ - sympy=1.12
54
+ - tbb=2021.8.0
55
+ - tk=8.6.12
56
+ - typing_extensions=4.9.0
57
+ - wheel=0.41.2
58
+ - xz=5.4.6
59
+ - zlib=1.2.13
60
+ - pip:
61
+ - absl-py==2.1.0
62
+ - addict==2.4.0
63
+ - aiofiles==23.2.1
64
+ - aiohttp==3.9.3
65
+ - aiosignal==1.3.1
66
+ - altair==5.3.0
67
+ - annotated-types==0.6.0
68
+ - antlr4-python3-runtime==4.9.3
69
+ - anyio==4.3.0
70
+ - async-timeout==4.0.3
71
+ - attrs==23.2.0
72
+ - cachetools==5.3.3
73
+ - certifi==2024.2.2
74
+ - charset-normalizer==3.3.2
75
+ - chumpy==0.70
76
+ - click==8.1.7
77
+ - git+https://github.com/openai/CLIP.git
78
+ - contourpy==1.1.1
79
+ - cycler==0.12.1
80
+ - cython==3.0.9
81
+ - dnspython==2.6.1
82
+ - email-validator==2.1.1
83
+ - exceptiongroup==1.2.1
84
+ - fastapi==0.111.0
85
+ - fastapi-cli==0.0.3
86
+ - ffmpy==0.3.2
87
+ - fonttools==4.49.0
88
+ - frozenlist==1.4.1
89
+ - fsspec==2024.2.0
90
+ - ftfy==6.2.0
91
+ - future==1.0.0
92
+ - google-auth==2.28.2
93
+ - google-auth-oauthlib==1.0.0
94
+ - gradio==4.31.0
95
+ - gradio-client==0.16.2
96
+ - grpcio==1.62.1
97
+ - h11==0.14.0
98
+ - httpcore==1.0.5
99
+ - httptools==0.6.1
100
+ - httpx==0.27.0
101
+ - huggingface-hub==0.21.4
102
+ - idna==3.6
103
+ - importlib-metadata==7.0.1
104
+ - importlib-resources==6.1.2
105
+ - joblib==1.4.0
106
+ - json-tricks==3.17.3
107
+ - jsonschema==4.22.0
108
+ - jsonschema-specifications==2023.12.1
109
+ - kiwisolver==1.4.5
110
+ - kornia==0.7.2
111
+ - kornia-rs==0.1.3
112
+ - lightning-utilities==0.11.2
113
+ - markdown==3.5.2
114
+ - markdown-it-py==3.0.0
115
+ - matplotlib==3.7.5
116
+ - mdurl==0.1.2
117
+ - mmcv-full==1.6.2
118
+ - mmpose==0.29.0
119
+ - multidict==6.0.5
120
+ - munkres==1.1.4
121
+ - numpy==1.24.4
122
+ - nvidia-cublas-cu12==12.1.3.1
123
+ - nvidia-cuda-cupti-cu12==12.1.105
124
+ - nvidia-cuda-nvrtc-cu12==12.1.105
125
+ - nvidia-cuda-runtime-cu12==12.1.105
126
+ - nvidia-cudnn-cu12==8.9.2.26
127
+ - nvidia-cufft-cu12==11.0.2.54
128
+ - nvidia-curand-cu12==10.3.2.106
129
+ - nvidia-cusolver-cu12==11.4.5.107
130
+ - nvidia-cusparse-cu12==12.1.0.106
131
+ - nvidia-nccl-cu12==2.19.3
132
+ - nvidia-nvjitlink-cu12==12.4.99
133
+ - nvidia-nvtx-cu12==12.1.105
134
+ - oauthlib==3.2.2
135
+ - omegaconf==2.3.0
136
+ - opencv-python==4.9.0.80
137
+ - orjson==3.10.3
138
+ - packaging==23.2
139
+ - pandas==2.0.3
140
+ - pillow==10.2.0
141
+ - pkgutil-resolve-name==1.3.10
142
+ - platformdirs==4.2.0
143
+ - protobuf==4.25.3
144
+ - pyasn1==0.5.1
145
+ - pyasn1-modules==0.3.0
146
+ - pydantic==2.7.1
147
+ - pydantic-core==2.18.2
148
+ - pydub==0.25.1
149
+ - pygments==2.18.0
150
+ - pyparsing==3.1.2
151
+ - python-dateutil==2.9.0.post0
152
+ - python-dotenv==1.0.1
153
+ - python-multipart==0.0.9
154
+ - pytorch-lightning==2.2.1
155
+ - pytz==2024.1
156
+ - pyyaml==6.0.1
157
+ - referencing==0.35.1
158
+ - regex==2023.12.25
159
+ - requests==2.31.0
160
+ - requests-oauthlib==1.4.0
161
+ - rich==13.7.1
162
+ - rpds-py==0.18.1
163
+ - rsa==4.9
164
+ - ruff==0.4.4
165
+ - safetensors==0.4.2
166
+ - scikit-learn==1.3.2
167
+ - scipy==1.10.1
168
+ - semantic-version==2.10.0
169
+ - sentencepiece==0.2.0
170
+ - shellingham==1.5.4
171
+ - six==1.16.0
172
+ - sniffio==1.3.1
173
+ - starlette==0.37.2
174
+ - tensorboard==2.14.0
175
+ - tensorboard-data-server==0.7.2
176
+ - threadpoolctl==3.4.0
177
+ - timm==0.4.12
178
+ - tokenizers==0.15.2
179
+ - tomli==2.0.1
180
+ - tomlkit==0.12.0
181
+ - toolz==0.12.1
182
+ - torch==2.2.1
183
+ - torchmetrics==1.3.2
184
+ - torchvision==0.17.1
185
+ - tqdm==4.66.2
186
+ - transformers==4.38.2
187
+ - triton==2.2.0
188
+ - typer==0.12.3
189
+ - tzdata==2024.1
190
+ - ujson==5.9.0
191
+ - urllib3==2.2.1
192
+ - uvicorn==0.29.0
193
+ - uvloop==0.19.0
194
+ - watchfiles==0.21.0
195
+ - wcwidth==0.2.13
196
+ - websockets==11.0.3
197
+ - werkzeug==3.0.1
198
+ - xtcocotools==1.14.3
199
+ - yapf==0.40.1
200
+ - yarl==1.9.4
201
+ - zipp==3.17.0
examples/animal.png ADDED
examples/car.png ADDED
examples/chair.png ADDED
examples/person.png ADDED
models/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.2.0
models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .core import * # noqa
2
+ from .datasets import * # noqa
3
+ from .models import * # noqa
models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (195 Bytes). View file
 
models/apis/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .train import train_model
2
+
3
+ __all__ = [
4
+ 'train_model'
5
+ ]
models/apis/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (206 Bytes). View file
 
models/apis/__pycache__/train.cpython-38.pyc ADDED
Binary file (3.15 kB). View file
 
models/apis/train.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from models.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook
5
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
6
+ from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
7
+ build_optimizer)
8
+ from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook
9
+ from mmpose.datasets import build_dataloader
10
+ from mmpose.utils import get_root_logger
11
+
12
+
13
def train_model(model,
                dataset,
                val_dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (Dataset): Train dataset.
        val_dataset (Dataset): Validation dataset; used only when
            ``validate`` is True.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    dataloader_setting = dict(
        samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
        workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
        # cfg.gpus will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed,
        pin_memory=False,
    )
    # explicit train_dataloader settings override the defaults assembled above
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **dataloader_setting) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters',
                                         False)  # NOTE: True has been modified to False for faster training.
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # re-pair few-shot support/query samples between epochs (see shuffle_cfg in configs)
    shuffle_cfg = cfg.get('shuffle_cfg', None)
    if shuffle_cfg is not None:
        for data_loader in data_loaders:
            runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg))

    # register eval hooks
    if validate:
        eval_cfg = cfg.get('evaluation', {})
        # NOTE(review): assumes cfg.evaluation always provides 'res_folder';
        # raises KeyError otherwise — confirm against the configs.
        eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder'])
        dataloader_setting = dict(
            # samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False,
            pin_memory=False,
        )
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # resume takes precedence over load: it also restores optimizer/epoch state
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
models/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
models/core/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (141 Bytes). View file
 
models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc ADDED
Binary file (1.26 kB). View file
 
models/core/custom_hooks/shuffle_hooks.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmcv.runner import Hook
2
+ from mmpose.utils import get_root_logger
3
+ from torch.utils.data import DataLoader
4
+
5
+
6
class ShufflePairedSamplesHook(Hook):
    """Non-distributed hook that reshuffles few-shot sample pairings.

    Every ``interval`` training epochs, calls
    ``dataset.random_paired_samples()`` on the wrapped dataloader's dataset
    so that support/query pairs are drawn anew for the next epoch.
    """

    def __init__(self, dataloader, interval=1):
        if not isinstance(dataloader, DataLoader):
            raise TypeError(f'dataloader must be a pytorch DataLoader, '
                            f'but got {type(dataloader)}')

        self.dataloader = dataloader
        self.interval = interval
        self.logger = get_root_logger()

    def after_train_epoch(self, runner):
        """Re-pair the dataset's samples at the configured epoch interval."""
        if self.every_n_epochs(runner, self.interval):
            self.dataloader.dataset.random_paired_samples()
models/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .builder import * # noqa
2
+ from .datasets import * # noqa
3
+ from .pipelines import * # noqa
models/datasets/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (210 Bytes). View file
 
models/datasets/__pycache__/builder.cpython-38.pyc ADDED
Binary file (1.92 kB). View file
 
models/datasets/builder.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmcv.utils import build_from_cfg
2
+ from mmpose.datasets.builder import DATASETS
3
+ from mmpose.datasets.dataset_wrappers import RepeatDataset
4
+ from torch.utils.data.dataset import ConcatDataset
5
+
6
+
7
def _concat_cfg(cfg):
    """Expand a multi-dataset config into one sub-config per dataset type.

    ``cfg['type']`` is a list; for each entry a deep copy of ``cfg`` is made
    with the per-dataset fields (``ann_file``, ``img_prefix``) and per-channel
    fields (``num_joints``, ``dataset_channel``) narrowed to that index.
    """
    per_dataset_keys = ['ann_file', 'img_prefix']
    per_channel_keys = ['num_joints', 'dataset_channel']
    expanded = []
    for idx in range(len(cfg['type'])):
        sub_cfg = cfg.deepcopy()
        sub_cfg['type'] = cfg['type'][idx]
        for key in per_dataset_keys:
            assert key in sub_cfg
            assert len(cfg['type']) == len(cfg[key]), (cfg[key])
            sub_cfg[key] = cfg[key][idx]
        for key in per_channel_keys:
            assert key in sub_cfg['data_cfg']
            assert len(cfg['type']) == len(cfg['data_cfg'][key])
            sub_cfg['data_cfg'][key] = cfg['data_cfg'][key][idx]
        expanded.append(sub_cfg)
    return expanded
24
+
25
+
26
+ def _check_vaild(cfg):
27
+ replace = ['num_joints', 'dataset_channel']
28
+ if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)):
29
+ for item in replace:
30
+ cfg['data_cfg'][item] = cfg['data_cfg'][item][0]
31
+ return cfg
32
+
33
+
34
def build_dataset(cfg, default_args=None):
    """Build a dataset from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        default_args (dict, optional): Default initialization arguments.
            Default: None.

    Returns:
        Dataset: The constructed dataset.
    """
    cfg_type = cfg['type']
    # In training, type is a list (e.g. several TransformerPoseDataset splits):
    # build each sub-config and concatenate.
    if isinstance(cfg_type, (list, tuple)):
        return ConcatDataset(
            [build_dataset(sub_cfg, default_args) for sub_cfg in _concat_cfg(cfg)])
    if cfg_type == 'RepeatDataset':
        return RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    cfg = _check_vaild(cfg)
    return build_from_cfg(cfg, DATASETS, default_args)