Spaces:
Runtime error
Runtime error
Commit ·
93b49a4
0
Parent(s):
initial commit
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +35 -0
- .idea/workspace.xml +60 -0
- LICENSE +203 -0
- README.md +14 -0
- app.py +442 -0
- configs/1shot-swin-clip/base_split1_config.py +195 -0
- configs/1shot-swin-clip/base_split2_config.py +195 -0
- configs/1shot-swin-clip/base_split3_config.py +195 -0
- configs/1shot-swin-clip/base_split4_config.py +195 -0
- configs/1shot-swin-clip/base_split5_config.py +195 -0
- configs/1shot-swin-clip/graph_split1_config.py +198 -0
- configs/1shot-swin-clip/graph_split2_config.py +197 -0
- configs/1shot-swin-clip/graph_split3_config.py +197 -0
- configs/1shot-swin-clip/graph_split4_config.py +197 -0
- configs/1shot-swin-clip/graph_split5_config.py +197 -0
- configs/1shot-swin-gte/base_split1_config.py +195 -0
- configs/1shot-swin-gte/base_split2_config.py +195 -0
- configs/1shot-swin-gte/base_split3_config.py +195 -0
- configs/1shot-swin-gte/base_split4_config.py +195 -0
- configs/1shot-swin-gte/base_split5_config.py +195 -0
- configs/1shot-swin-gte/graph_split1_config.py +199 -0
- configs/1shot-swin-gte/graph_split2_config.py +197 -0
- configs/1shot-swin-gte/graph_split3_config.py +197 -0
- configs/1shot-swin-gte/graph_split4_config.py +197 -0
- configs/1shot-swin-gte/graph_split5_config.py +197 -0
- configs/_base_/datasets/ap10k.py +142 -0
- configs/_base_/default_runtime.py +20 -0
- configs/demo_b.py +191 -0
- demo_text.py +212 -0
- docker/Dockerfile +59 -0
- environment.yml +201 -0
- examples/animal.png +0 -0
- examples/car.png +0 -0
- examples/chair.png +0 -0
- examples/person.png +0 -0
- models/VERSION +1 -0
- models/__init__.py +3 -0
- models/__pycache__/__init__.cpython-38.pyc +0 -0
- models/apis/__init__.py +5 -0
- models/apis/__pycache__/__init__.cpython-38.pyc +0 -0
- models/apis/__pycache__/train.cpython-38.pyc +0 -0
- models/apis/train.py +126 -0
- models/core/__init__.py +1 -0
- models/core/__pycache__/__init__.cpython-38.pyc +0 -0
- models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc +0 -0
- models/core/custom_hooks/shuffle_hooks.py +29 -0
- models/datasets/__init__.py +3 -0
- models/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
- models/datasets/__pycache__/builder.cpython-38.pyc +0 -0
- models/datasets/builder.py +54 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.idea/workspace.xml
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
| 2 |
+
<project version="4">
|
| 3 |
+
<component name="AutoImportSettings">
|
| 4 |
+
<option name="autoReloadType" value="SELECTIVE" />
|
| 5 |
+
</component>
|
| 6 |
+
<component name="ChangeListManager">
|
| 7 |
+
<list default="true" id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
|
| 8 |
+
<option name="SHOW_DIALOG" value="false" />
|
| 9 |
+
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
| 10 |
+
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
| 11 |
+
<option name="LAST_RESOLUTION" value="IGNORE" />
|
| 12 |
+
</component>
|
| 13 |
+
<component name="Git.Settings">
|
| 14 |
+
<option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
|
| 15 |
+
</component>
|
| 16 |
+
<component name="ProjectColorInfo"><![CDATA[{
|
| 17 |
+
"associatedIndex": 4
|
| 18 |
+
}]]></component>
|
| 19 |
+
<component name="ProjectId" id="2hKQwKx3zpbH4D8IcAn5ZJcn2HY" />
|
| 20 |
+
<component name="ProjectViewState">
|
| 21 |
+
<option name="hideEmptyMiddlePackages" value="true" />
|
| 22 |
+
<option name="showLibraryContents" value="true" />
|
| 23 |
+
</component>
|
| 24 |
+
<component name="PropertiesComponent"><![CDATA[{
|
| 25 |
+
"keyToString": {
|
| 26 |
+
"RunOnceActivity.ShowReadmeOnStart": "true",
|
| 27 |
+
"git-widget-placeholder": "main",
|
| 28 |
+
"last_opened_file_path": "/home/matanru/huggingface/CapeX",
|
| 29 |
+
"node.js.detected.package.eslint": "true",
|
| 30 |
+
"node.js.detected.package.tslint": "true",
|
| 31 |
+
"node.js.selected.package.eslint": "(autodetect)",
|
| 32 |
+
"node.js.selected.package.tslint": "(autodetect)",
|
| 33 |
+
"nodejs_package_manager_path": "npm",
|
| 34 |
+
"vue.rearranger.settings.migration": "true"
|
| 35 |
+
}
|
| 36 |
+
}]]></component>
|
| 37 |
+
<component name="SharedIndexes">
|
| 38 |
+
<attachedChunks>
|
| 39 |
+
<set>
|
| 40 |
+
<option value="bundled-js-predefined-1d06a55b98c1-cb551a44b0f8-JavaScript-PY-242.10180.30" />
|
| 41 |
+
<option value="bundled-python-sdk-7efad6460ed6-db4a76ca2eac-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-242.10180.30" />
|
| 42 |
+
</set>
|
| 43 |
+
</attachedChunks>
|
| 44 |
+
</component>
|
| 45 |
+
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
|
| 46 |
+
<component name="TaskManager">
|
| 47 |
+
<task active="true" id="Default" summary="Default task">
|
| 48 |
+
<changelist id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
|
| 49 |
+
<created>1717340527309</created>
|
| 50 |
+
<option name="number" value="Default" />
|
| 51 |
+
<option name="presentableId" value="Default" />
|
| 52 |
+
<updated>1717340527309</updated>
|
| 53 |
+
<workItem from="1717340528499" duration="17535000" />
|
| 54 |
+
</task>
|
| 55 |
+
<servers />
|
| 56 |
+
</component>
|
| 57 |
+
<component name="TypeScriptGeneratedFilesManager">
|
| 58 |
+
<option name="version" value="3" />
|
| 59 |
+
</component>
|
| 60 |
+
</project>
|
LICENSE
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Copyright (c) 2022 SenseTime. All Rights Reserved.
|
| 2 |
+
|
| 3 |
+
Apache License
|
| 4 |
+
Version 2.0, January 2004
|
| 5 |
+
http://www.apache.org/licenses/
|
| 6 |
+
|
| 7 |
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
| 8 |
+
|
| 9 |
+
1. Definitions.
|
| 10 |
+
|
| 11 |
+
"License" shall mean the terms and conditions for use, reproduction,
|
| 12 |
+
and distribution as defined by Sections 1 through 9 of this document.
|
| 13 |
+
|
| 14 |
+
"Licensor" shall mean the copyright owner or entity authorized by
|
| 15 |
+
the copyright owner that is granting the License.
|
| 16 |
+
|
| 17 |
+
"Legal Entity" shall mean the union of the acting entity and all
|
| 18 |
+
other entities that control, are controlled by, or are under common
|
| 19 |
+
control with that entity. For the purposes of this definition,
|
| 20 |
+
"control" means (i) the power, direct or indirect, to cause the
|
| 21 |
+
direction or management of such entity, whether by contract or
|
| 22 |
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
| 23 |
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
| 24 |
+
|
| 25 |
+
"You" (or "Your") shall mean an individual or Legal Entity
|
| 26 |
+
exercising permissions granted by this License.
|
| 27 |
+
|
| 28 |
+
"Source" form shall mean the preferred form for making modifications,
|
| 29 |
+
including but not limited to software source code, documentation
|
| 30 |
+
source, and configuration files.
|
| 31 |
+
|
| 32 |
+
"Object" form shall mean any form resulting from mechanical
|
| 33 |
+
transformation or translation of a Source form, including but
|
| 34 |
+
not limited to compiled object code, generated documentation,
|
| 35 |
+
and conversions to other media types.
|
| 36 |
+
|
| 37 |
+
"Work" shall mean the work of authorship, whether in Source or
|
| 38 |
+
Object form, made available under the License, as indicated by a
|
| 39 |
+
copyright notice that is included in or attached to the work
|
| 40 |
+
(an example is provided in the Appendix below).
|
| 41 |
+
|
| 42 |
+
"Derivative Works" shall mean any work, whether in Source or Object
|
| 43 |
+
form, that is based on (or derived from) the Work and for which the
|
| 44 |
+
editorial revisions, annotations, elaborations, or other modifications
|
| 45 |
+
represent, as a whole, an original work of authorship. For the purposes
|
| 46 |
+
of this License, Derivative Works shall not include works that remain
|
| 47 |
+
separable from, or merely link (or bind by name) to the interfaces of,
|
| 48 |
+
the Work and Derivative Works thereof.
|
| 49 |
+
|
| 50 |
+
"Contribution" shall mean any work of authorship, including
|
| 51 |
+
the original version of the Work and any modifications or additions
|
| 52 |
+
to that Work or Derivative Works thereof, that is intentionally
|
| 53 |
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
| 54 |
+
or by an individual or Legal Entity authorized to submit on behalf of
|
| 55 |
+
the copyright owner. For the purposes of this definition, "submitted"
|
| 56 |
+
means any form of electronic, verbal, or written communication sent
|
| 57 |
+
to the Licensor or its representatives, including but not limited to
|
| 58 |
+
communication on electronic mailing lists, source code control systems,
|
| 59 |
+
and issue tracking systems that are managed by, or on behalf of, the
|
| 60 |
+
Licensor for the purpose of discussing and improving the Work, but
|
| 61 |
+
excluding communication that is conspicuously marked or otherwise
|
| 62 |
+
designated in writing by the copyright owner as "Not a Contribution."
|
| 63 |
+
|
| 64 |
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
| 65 |
+
on behalf of whom a Contribution has been received by Licensor and
|
| 66 |
+
subsequently incorporated within the Work.
|
| 67 |
+
|
| 68 |
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
| 69 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 70 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 71 |
+
copyright license to reproduce, prepare Derivative Works of,
|
| 72 |
+
publicly display, publicly perform, sublicense, and distribute the
|
| 73 |
+
Work and such Derivative Works in Source or Object form.
|
| 74 |
+
|
| 75 |
+
3. Grant of Patent License. Subject to the terms and conditions of
|
| 76 |
+
this License, each Contributor hereby grants to You a perpetual,
|
| 77 |
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
| 78 |
+
(except as stated in this section) patent license to make, have made,
|
| 79 |
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
| 80 |
+
where such license applies only to those patent claims licensable
|
| 81 |
+
by such Contributor that are necessarily infringed by their
|
| 82 |
+
Contribution(s) alone or by combination of their Contribution(s)
|
| 83 |
+
with the Work to which such Contribution(s) was submitted. If You
|
| 84 |
+
institute patent litigation against any entity (including a
|
| 85 |
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
| 86 |
+
or a Contribution incorporated within the Work constitutes direct
|
| 87 |
+
or contributory patent infringement, then any patent licenses
|
| 88 |
+
granted to You under this License for that Work shall terminate
|
| 89 |
+
as of the date such litigation is filed.
|
| 90 |
+
|
| 91 |
+
4. Redistribution. You may reproduce and distribute copies of the
|
| 92 |
+
Work or Derivative Works thereof in any medium, with or without
|
| 93 |
+
modifications, and in Source or Object form, provided that You
|
| 94 |
+
meet the following conditions:
|
| 95 |
+
|
| 96 |
+
(a) You must give any other recipients of the Work or
|
| 97 |
+
Derivative Works a copy of this License; and
|
| 98 |
+
|
| 99 |
+
(b) You must cause any modified files to carry prominent notices
|
| 100 |
+
stating that You changed the files; and
|
| 101 |
+
|
| 102 |
+
(c) You must retain, in the Source form of any Derivative Works
|
| 103 |
+
that You distribute, all copyright, patent, trademark, and
|
| 104 |
+
attribution notices from the Source form of the Work,
|
| 105 |
+
excluding those notices that do not pertain to any part of
|
| 106 |
+
the Derivative Works; and
|
| 107 |
+
|
| 108 |
+
(d) If the Work includes a "NOTICE" text file as part of its
|
| 109 |
+
distribution, then any Derivative Works that You distribute must
|
| 110 |
+
include a readable copy of the attribution notices contained
|
| 111 |
+
within such NOTICE file, excluding those notices that do not
|
| 112 |
+
pertain to any part of the Derivative Works, in at least one
|
| 113 |
+
of the following places: within a NOTICE text file distributed
|
| 114 |
+
as part of the Derivative Works; within the Source form or
|
| 115 |
+
documentation, if provided along with the Derivative Works; or,
|
| 116 |
+
within a display generated by the Derivative Works, if and
|
| 117 |
+
wherever such third-party notices normally appear. The contents
|
| 118 |
+
of the NOTICE file are for informational purposes only and
|
| 119 |
+
do not modify the License. You may add Your own attribution
|
| 120 |
+
notices within Derivative Works that You distribute, alongside
|
| 121 |
+
or as an addendum to the NOTICE text from the Work, provided
|
| 122 |
+
that such additional attribution notices cannot be construed
|
| 123 |
+
as modifying the License.
|
| 124 |
+
|
| 125 |
+
You may add Your own copyright statement to Your modifications and
|
| 126 |
+
may provide additional or different license terms and conditions
|
| 127 |
+
for use, reproduction, or distribution of Your modifications, or
|
| 128 |
+
for any such Derivative Works as a whole, provided Your use,
|
| 129 |
+
reproduction, and distribution of the Work otherwise complies with
|
| 130 |
+
the conditions stated in this License.
|
| 131 |
+
|
| 132 |
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
| 133 |
+
any Contribution intentionally submitted for inclusion in the Work
|
| 134 |
+
by You to the Licensor shall be under the terms and conditions of
|
| 135 |
+
this License, without any additional terms or conditions.
|
| 136 |
+
Notwithstanding the above, nothing herein shall supersede or modify
|
| 137 |
+
the terms of any separate license agreement you may have executed
|
| 138 |
+
with Licensor regarding such Contributions.
|
| 139 |
+
|
| 140 |
+
6. Trademarks. This License does not grant permission to use the trade
|
| 141 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 142 |
+
except as required for reasonable and customary use in describing the
|
| 143 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 144 |
+
|
| 145 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 146 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 147 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 148 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 149 |
+
implied, including, without limitation, any warranties or conditions
|
| 150 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 151 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 152 |
+
appropriateness of using or redistributing the Work and assume any
|
| 153 |
+
risks associated with Your exercise of permissions under this License.
|
| 154 |
+
|
| 155 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 156 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 157 |
+
unless required by applicable law (such as deliberate and grossly
|
| 158 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 159 |
+
liable to You for damages, including any direct, indirect, special,
|
| 160 |
+
incidental, or consequential damages of any character arising as a
|
| 161 |
+
result of this License or out of the use or inability to use the
|
| 162 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 163 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 164 |
+
other commercial damages or losses), even if such Contributor
|
| 165 |
+
has been advised of the possibility of such damages.
|
| 166 |
+
|
| 167 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 168 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 169 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 170 |
+
or other liability obligations and/or rights consistent with this
|
| 171 |
+
License. However, in accepting such obligations, You may act only
|
| 172 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 173 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 174 |
+
defend, and hold each Contributor harmless for any liability
|
| 175 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 176 |
+
of your accepting any such warranty or additional liability.
|
| 177 |
+
|
| 178 |
+
END OF TERMS AND CONDITIONS
|
| 179 |
+
|
| 180 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 181 |
+
|
| 182 |
+
To apply the Apache License to your work, attach the following
|
| 183 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 184 |
+
replaced with your own identifying information. (Don't include
|
| 185 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 186 |
+
comment syntax for the file format. We also recommend that a
|
| 187 |
+
file or class name and description of purpose be included on the
|
| 188 |
+
same "printed page" as the copyright notice for easier
|
| 189 |
+
identification within third-party archives.
|
| 190 |
+
|
| 191 |
+
Copyright 2020 MMClassification Authors.
|
| 192 |
+
|
| 193 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 194 |
+
you may not use this file except in compliance with the License.
|
| 195 |
+
You may obtain a copy of the License at
|
| 196 |
+
|
| 197 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 198 |
+
|
| 199 |
+
Unless required by applicable law or agreed to in writing, software
|
| 200 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 201 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 202 |
+
See the License for the specific language governing permissions and
|
| 203 |
+
limitations under the License.
|
README.md
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: CapeX
|
| 3 |
+
emoji: 👁
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.36.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
python: 3.10.13
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
+
import argparse
|
| 3 |
+
import random
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
os.system('python setup.py develop')
|
| 7 |
+
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import matplotlib
|
| 10 |
+
import numpy as np
|
| 11 |
+
import torch
|
| 12 |
+
from PIL import ImageDraw, Image
|
| 13 |
+
from matplotlib import pyplot as plt
|
| 14 |
+
from mmcv import Config
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
# def replace_line(file_name, line_num, text):
|
| 18 |
+
# lines = open(file_name, 'r').readlines()
|
| 19 |
+
# lines[line_num] = text
|
| 20 |
+
# out = open(file_name, 'w')
|
| 21 |
+
# out.writelines(lines)
|
| 22 |
+
# out.close()
|
| 23 |
+
|
| 24 |
+
# def read_lines(file_name):
|
| 25 |
+
# lines = open(file_name, 'r').readlines()
|
| 26 |
+
# print(lines)
|
| 27 |
+
|
| 28 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 7, "from mmengine import print_log\n")
|
| 29 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 8, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
|
| 30 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/registry.py", 3, 'from mmengine.registry import Registry\n')
|
| 31 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/fileio/io.py", 5, "from mmengine.utils import is_list_of\n")
|
| 32 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/checkpoint.py", 23, "from mmengine.utils import digit_version, mkdir_or_exist\nfrom mmengine.utils.dl_utils import load_url\n")
|
| 33 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/hook.py", 1, "from mmengine.registry import Registry\nfrom mmengine.utils import is_method_overridden\n")
|
| 34 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/evaluation.py",11, "from mmengine.utils import is_seq_of\n")
|
| 35 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/mlflow.py", 3, "from mmengine.utils.dl_utils import TORCH_VERSION\n")
|
| 36 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/tensorboard.py", 4, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
|
| 37 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/text.py", 12, "from mmengine.utils import is_tuple_of, scandir\n")
|
| 38 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/wandb.py", 5, "from mmengine.utils import scandir\n")
|
| 39 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 11, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm\n")
|
| 40 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 14, "from mmengine.utils import digit_version\n")
|
| 41 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/fp16_utils.py", 12, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
|
| 42 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/builder.py", 4, "from mmengine.registry import Registry\n")
|
| 43 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/builder.py", 7, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.registry import Registry, build_from_cfg\n")
|
| 44 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/default_constructor.py", 8, "from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm\nfrom mmengine.registry import build_from_cfg\nfrom mmengine.utils import is_list_of\n")
|
| 45 |
+
|
| 46 |
+
# def is_ipu_available() -> bool:
|
| 47 |
+
# try:
|
| 48 |
+
# import poptorch
|
| 49 |
+
# return poptorch.ipuHardwareIsAvailable()
|
| 50 |
+
# except ImportError:
|
| 51 |
+
# return False
|
| 52 |
+
|
| 53 |
+
# IS_IPU_AVAILABLE = str(is_ipu_available())
|
| 54 |
+
|
| 55 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/ipu/__init__.py", 1, f'IS_IPU_AVAILABLE = {IS_IPU_AVAILABLE}\n')
|
| 56 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/scatter_gather.py", 4, "from mmengine.utils import deprecated_api_warning\n")
|
| 57 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/_functions.py", 5, "from mmengine.utils import deprecated_api_warning\n")
|
| 58 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 1, "from mmengine.utils import digit_version\nfrom mmcv import parse_version_info\n")
|
| 59 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 21, "import mmcv\nmmcv_version = digit_version(mmcv.__version__)\n")
|
| 60 |
+
# replace_line("/usr/local/lib/python3.10/site-packages/mmpose/core/optimizers/builder.py", 3, "from mmengine.registry import Registry, build_from_cfg")
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
from mmcv.runner import load_checkpoint
|
| 64 |
+
from mmpose.core import wrap_fp16_model
|
| 65 |
+
from mmpose.models import build_posenet
|
| 66 |
+
from torchvision import transforms
|
| 67 |
+
|
| 68 |
+
from demo_text import Resize_Pad
|
| 69 |
+
from models import *
|
| 70 |
+
|
| 71 |
+
import networkx as nx
|
| 72 |
+
import matplotlib.pyplot as plt
|
| 73 |
+
import ast
|
| 74 |
+
import cv2
|
| 75 |
+
|
| 76 |
+
import matplotlib
|
| 77 |
+
# matplotlib.use('agg')
|
| 78 |
+
|
| 79 |
+
def edges_prompt_to_list(prompt):
    """Parse a user-typed edge list (e.g. "(0,1),(1,2)") into a Python object.

    The prompt is wrapped in square brackets if the user omitted them, then
    evaluated with ``ast.literal_eval`` (safe: literals only, no code execution).

    Args:
        prompt: Raw edge-list text from the UI textbox.

    Returns:
        The parsed literal (typically a list/tuple of 2-element pairs), or an
        empty list when the prompt is blank.

    Raises:
        ValueError / SyntaxError: if the text is not a valid Python literal.
    """
    # Strip whitespace so a leading space does not defeat the bracket checks;
    # an empty prompt previously raised IndexError on prompt[0].
    prompt = prompt.strip()
    if not prompt:
        return []
    if prompt[0] != "[":
        prompt = "[" + prompt
    if prompt[-1] != "]":
        prompt += "]"
    return ast.literal_eval(prompt)
|
| 85 |
+
|
| 86 |
+
def descriptions_prompt_to_list(prompt):
    """Split a comma-separated keypoint-description prompt into a list.

    Note: segments are returned verbatim — surrounding whitespace and empty
    segments are preserved, exactly as ``str.split(',')`` produces them.
    """
    separator = ','
    return prompt.split(separator)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# Function to visualize the graph
|
| 91 |
+
def visualize_graph(node_descriptions, edges):
    """Render the support text-graph and return it as an RGB numpy array.

    Args:
        node_descriptions (str): comma-separated keypoint names; parsed by
            descriptions_prompt_to_list.
        edges (str): textual list of 2-element lists, e.g. "[[0,1], [1,2]]";
            parsed by edges_prompt_to_list.

    Returns:
        np.ndarray: (H, W, 3) uint8 image captured from the matplotlib canvas.
    """
    # Drop any stale figures left over from previous Gradio callbacks —
    # this function draws onto the implicit current figure (plt.gcf below).
    plt.close('all')
    node_descriptions = descriptions_prompt_to_list(node_descriptions)
    edges = edges_prompt_to_list(edges)

    # Create an empty graph
    G = nx.Graph()
    G.clear()

    # Add one node per description, keeping the list index as node id so
    # edge indices in the prompt line up with descriptions.
    for i, desc in enumerate(node_descriptions):
        G.add_node(i, label=desc)

    # Add edges
    for edge in edges:
        G.add_edge(edge[0], edge[1])

    # Draw the graph
    pos = nx.spring_layout(G)  # force-directed layout (non-deterministic seed)
    labels = nx.get_node_attributes(G, 'label')  # node id -> description
    nx.draw(G, pos, with_labels=True, labels=labels, node_size=1500, node_color='skyblue', font_size=10, font_weight='bold', font_color='black')  # Draw nodes with labels
    nx.draw_networkx_edges(G, pos, width=2, edge_color='gray')  # Draw edges
    plt.title("Graph Visualization")  # Set title
    plt.axis('off')  # Turn off axis
    # plt.show() # Show plot
    # Capture the rendered figure into a numpy image instead of showing it.
    fig = plt.gcf()
    # fig.tight_layout(pad=0)

    # To remove the huge white borders
    # plt.margins(0)

    fig.canvas.draw()
    # NOTE(review): tostring_rgb is deprecated/removed in newer matplotlib
    # (buffer_rgba is the replacement) — confirm the pinned version.
    image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    # Canvas reports (width, height); numpy wants (rows, cols) = (H, W).
    image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.clf()
    return image_from_plot
|
| 128 |
+
|
| 129 |
+
checkpoint_path = ''  # filled from --checkpoint in the __main__ guard before demo.launch()
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def plot_query_results(query_img, query_w, skeleton, prediction, radius=6):
    """Overlay predicted keypoints and skeleton limbs on the query image.

    Args:
        query_img: (H, W, C) float array, normalized to [0, 1] below.
        query_w: per-keypoint weights; only keypoints with weight > 0 are drawn.
        skeleton: list of [i, j] limb index pairs.
        prediction: stacked predictions; only the last entry is used, with
            normalized coordinates scaled by the image height.
        radius (int): circle radius for keypoint markers.

    Returns:
        The matplotlib pyplot module holding the finished current figure
        (handed to gr.Plot by the caller).
    """
    h, w, c = query_img.shape
    # Take the final decoder layer's prediction and map normalized coords
    # back to pixels. NOTE(review): x is also scaled by h, not w — assumes a
    # square input (Resize_Pad makes it square upstream); confirm.
    prediction = prediction[-1].cpu().numpy() * h
    # prediction = prediction.cpu().numpy() * h
    # Min-max normalize the image into [0, 1] for imshow.
    query_img = (query_img - np.min(query_img)) / (
            np.max(query_img) - np.min(query_img))
    # Single-element zip kept from a multi-image version of this code;
    # note `w` and `c` from above are shadowed inside the loop.
    for id, (img, w, keypoint) in enumerate(zip([query_img],
                                                [query_w],
                                                [prediction])):
        f, axes = plt.subplots()
        plt.imshow(img)
        # Draw each visible keypoint as a colored circle with its index.
        for k in range(keypoint.shape[0]):
            if w[k] > 0:
                kp = keypoint[k, :2]
                # red for full-weight keypoints, blue otherwise
                c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6)
                patch = plt.Circle(kp, radius, color=c)
                axes.add_patch(patch)
                axes.text(kp[0], kp[1], k)
                plt.draw()
        # Draw each limb whose two endpoints are both visible.
        for l, limb in enumerate(skeleton):
            kp = keypoint[:, :2]
            # Fall back to a random color once the fixed palette runs out.
            if l > len(COLORS) - 1:
                c = [x / 255 for x in random.sample(range(0, 255), 3)]
            else:
                c = [x / 255 for x in COLORS[l]]
            if w[limb[0]] > 0 and w[limb[1]] > 0:
                patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
                                   [kp[limb[0], 1], kp[limb[1], 1]],
                                   linewidth=6, color=c, alpha=0.6)
                axes.add_artist(patch)
        plt.axis('off')  # command for hiding the axis.
        # Remove all margins so the image fills the figure.
        plt.subplots_adjust(0, 0, 1, 1, 0, 0)
        plt.margins(0)
    fig = plt.gcf()
    fig.tight_layout(pad=0)

    return plt
|
| 170 |
+
|
| 171 |
+
# Fixed RGB palette (0-255) used to color skeleton limbs in
# plot_query_results; limbs beyond len(COLORS) get random colors there.
COLORS = [
    [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]
]

model = None  # global posenet handle; built and loaded inside process()
|
| 179 |
+
|
| 180 |
+
# @spaces.GPU(duration=30)
|
| 181 |
+
# def estimate(model, data):
|
| 182 |
+
# with torch.no_grad():
|
| 183 |
+
# model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
|
| 184 |
+
# data["img_q"].to(device=model_device)
|
| 185 |
+
# data['target_weight_s'][0].to(device=model_device)
|
| 186 |
+
# print(f'img type: {data["img_q"].dtype}, target_weight type: {data["target_weight_s"][0].dtype}')
|
| 187 |
+
# model.to(model_device)
|
| 188 |
+
# model.eval()
|
| 189 |
+
# # return model(**data)
|
| 190 |
+
# return model(str(data))
|
| 191 |
+
|
| 192 |
+
# @spaces.GPU(duration=30)
|
| 193 |
+
# @spaces.GPU(duration=30)
def estimate(data):
    """Run the globally-built posenet on the serialized input.

    `data` is forwarded to the model unchanged; process() passes a JSON
    string produced with CustomEncoder. Gradient tracking is disabled
    since this is inference only.
    """
    global model
    with torch.no_grad():
        prediction = model(data)
    return prediction
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
# Custom JSON encoder to handle non-serializable objects
|
| 204 |
+
# Custom JSON encoder to handle non-serializable objects
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that serializes numpy arrays as (nested) Python lists.

    Anything that is not an ndarray is deferred to the base encoder,
    which raises TypeError for unsupported types as usual.
    """

    def default(self, obj):
        if not isinstance(obj, np.ndarray):
            return json.JSONEncoder.default(self, obj)
        return obj.tolist()
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def process(query_img, node_descriptions, edges,
            cfg_path='configs/1shot-swin-gte/graph_split1_config.py'):
    """Pose the query image from a text-graph prompt and return a plot.

    Builds the model from `cfg_path`, loads the global `checkpoint_path`
    weights, assembles a single-sample data dict in the format the posenet
    expects, runs inference via estimate(), and renders the predicted
    keypoints with plot_query_results.

    Args:
        query_img: query image — assumes a PIL RGB image (converted to a
            BGR ndarray below); TODO confirm against the Gradio component.
        node_descriptions (str): comma-separated keypoint descriptions.
        edges (str): textual edge list, e.g. "[[0,1], [1,2]]".
        cfg_path (str): mm-style config file describing model and data.

    Returns:
        The matplotlib pyplot module returned by plot_query_results.
    """
    global model
    node_descriptions = descriptions_prompt_to_list(node_descriptions)
    edges = edges_prompt_to_list(edges)
    cfg = Config.fromfile(cfg_path)
    # Dummy support keypoints at the origin — only their count matters for
    # building the heatmap/meta tensors; the text-graph carries the signal.
    kp_src_tensor = torch.zeros((len(node_descriptions), 2))
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size,
                   cfg.model.encoder_config.img_size)])

    # The model expects at least one edge; use a degenerate self-edge.
    if len(edges) == 0:
        edges = [(0, 0)]

    #model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # RGB -> BGR channel flip, then flip(0) restores RGB after ToTensor.
    np_query = np.array(query_img)[:, :, ::-1].copy()
    q_img = preprocess(np_query).flip(0)[None] #.to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
                                       cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Lift dummy 2D keypoints to the (x, y, vis) layout the generator wants.
    kp_src_3d = torch.cat(
        (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.cat(
        (torch.ones_like(kp_src_tensor),
         torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg,
                                                                 kp_src_3d,
                                                                 kp_src_3d_weight,
                                                                 sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    # All-ones weights: every described keypoint is treated as visible.
    target_weight_s = torch.ones_like(
        torch.tensor(target_weight_s).float()[None]) #.to(model_device)

    # Single-episode data dict in the format the posenet's forward expects.
    data = {
        'img_s': [0],
        'img_q': q_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [edges],
                       'query_skeleton': edges,
                       # 'sample_point_descriptions': np.array([node_descriptions]),
                       'sample_point_descriptions': node_descriptions,
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src_tensor.mean(dim=0)],
                       'query_center': kp_src_tensor.mean(dim=0),
                       'sample_scale': [
                           kp_src_tensor.max(dim=0)[0] -
                           kp_src_tensor.min(dim=0)[0]],
                       'query_scale': kp_src_tensor.max(dim=0)[0] -
                                      kp_src_tensor.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }
    # Load model (rebuilt on every call; weights come from checkpoint_path)
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, checkpoint_path, map_location='cpu')
    #model.to(model_device)
    #model.eval()

    # with torch.no_grad():
    #     outputs = model(**data)

    # Convert tensors to plain lists so the dict is JSON-serializable.
    data["img_q"] = data["img_q"].cpu().numpy().tolist()
    data['target_weight_s'][0] = data['target_weight_s'][0].cpu().numpy().tolist()
    data['target_s'][0] = data['target_s'][0].cpu().numpy().tolist()

    data['img_metas'][0]['sample_joints_3d'][0] = data['img_metas'][0]['sample_joints_3d'][0].cpu().tolist()
    data['img_metas'][0]['query_joints_3d'] = data['img_metas'][0]['query_joints_3d'].cpu().tolist()
    data['img_metas'][0]['sample_center'][0] = data['img_metas'][0]['sample_center'][0].cpu().tolist()
    data['img_metas'][0]['query_center'] = data['img_metas'][0]['query_center'].cpu().tolist()
    data['img_metas'][0]['sample_scale'][0] = data['img_metas'][0]['sample_scale'][0].cpu().tolist()
    data['img_metas'][0]['query_scale'] = data['img_metas'][0]['query_scale'].cpu().tolist()

    # # data['img_metas'][0]['sample_point_descriptions'] = data['img_metas'][0]['sample_point_descriptions'].tolist()


    #model.cuda()
    model.eval()
    # return model(**data)
    # with torch.no_grad():
    #     outputs = model(**data)
    # Serialize the whole payload; the model is invoked with a JSON string.
    str_data = json.dumps(data, cls=CustomEncoder)

    outputs = estimate(str_data)
    #outputs = estimate(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    out = plot_query_results(vis_q_image, vis_q_weight, edges, torch.tensor(outputs['points']).squeeze(0))
    return out
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
def update_examples(query_img, node_descriptions, edges):
    """Identity pass-through for gr.Examples: echo the clicked example's
    values back into the corresponding input components unchanged."""
    return (query_img, node_descriptions, edges)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# Gradio UI layout: text-graph inputs + live graph preview on top, query
# image and Evaluate button below, output plot at the bottom, followed by
# clickable examples.
with gr.Blocks() as demo:
    # Per-session scratch state kept from the click-based variant of this
    # demo; the text-driven flow below does not read it.
    state = gr.State({
        'kp_src': [],
        'skeleton': [],
        'count': 0,
        'color_idx': 0,
        'prev_pt': None,
        'prev_pt_idx': None,
        'prev_clicked': None,
        'point_descriptions': None,
    })
    gr.Markdown('''
    # CapeX Demo
    We present a novel category agnostic pose estimation approach that utilizes support text-graphs
    (graphs with text on its nodes), instead of the conventional techniques that use support images.
    By leveraging the abstraction power of text-graphs, CapeX showcases SOTA results on MP100 while dropping the need
    of providing an annotated support image.
    ### [Paper](https://arxiv.org/pdf/2406.00384) | [GitHub](https://github.com/matanr/capex)
    ## Instructions
    1. Explain using text the desired keypoints. Pleaser refer to the example for the right format.
    2. Optionally provide a graph representing the connections between the keypoints. Pleaser refer to the example for the right format.
    3. Upload an image of the object you want to pose to the query image.
    4. Click **Evaluate** to pose the query image.
    ''')
    with gr.Row():
        # Input block for node descriptions
        node_descriptions = gr.Textbox(label="Node Descriptions (String separated by commas)", lines=5, type="text",
                                       # value="left eye, right eye, nose, neck, root of tail, left shoulder, left elbow, "
                                       #       "left front paw, right shoulder, right elbow, right front paw, left hip, "
                                       #       "left knee, left back paw, right hip, right knee, right back paw"
                                       value="left eye, nose, right eye"
                                       )

        # Input block for edges
        edges = gr.Textbox(label="Edges (List of 2-valued lists representing connections)", lines=5, type="text",
                           # value="[[0, 1], [0, 2], [1, 2], [2, 3], [3, 4], [3, 5], [5, 6], [6, 7], [3, 8], "
                           #       "[8, 9], [9, 10], [4, 11], [11, 12], [12, 13], [4, 14], [14, 15], [15, 16]]"
                           value="[[0,1], [1,2]]"
                           )

    # Render the default prompt's graph once so the preview is populated
    # before the user edits anything.
    def set_initial_text_graph():
        text_graph = visualize_graph("left eye, nose, right eye", "[[0,1], [1,2]]")
        return text_graph

    text_graph = gr.Image(label="Text-graph visualization",
                          value=set_initial_text_graph,
                          type="pil", height=400, width=400)

    with gr.Row():
        query_img = gr.Image(label="Query Image",
                             type="pil", height=400, width=400)
    with gr.Row():
        eval_btn = gr.Button(value="Evaluate")
    with gr.Row():
        output_img = gr.Plot(label="Output Image")
    with gr.Row():
        gr.Markdown("## Examples")
    with gr.Row():
        # Canned (image, descriptions, edges) triples; clicking one copies
        # its values into the inputs via update_examples.
        gr.Examples(
            examples=[
                ['examples/animal.png',
                 "left eye, right eye, nose, neck, root of tail, left shoulder, left elbow, "
                 "left front paw, right shoulder, right elbow, right front paw, left hip, "
                 "left knee, left back paw, right hip, right knee, right back paw",
                 "[[0, 1], [0, 2], [1, 2], [2, 3], [3, 4], [3, 5], [5, 6], [6, 7], [3, 8], [8, 9],"
                 "[9, 10], [4, 11], [11, 12], [12, 13], [4, 14], [14, 15], [15, 16]]"
                 ],
                ['examples/person.png',
                 "nose, left eye, right eye, left ear, right ear, left shoulder, right shoulder, left elbow, "
                 "right elbow, left wrist, right wrist, left hip, right hip, left knee, right knee, left ankle, "
                 "right ankle",
                 "[[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],"
                 "[6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]"
                 ],
                ['examples/chair.png',
                 "left and front leg, right and front leg, right and back leg, left and back leg, "
                 "left and front side of the seat, right and front side of the seat, right and back side of the seat, "
                 "left and back side of the seat, top left side of the backseat, top right side of the backseat",
                 "[[0, 4], [3, 7], [1, 5], [2, 6], [4, 5], [5, 6], [6, 7], [7, 4], [6, 7], [7, 8],[8, 9], [9, 6]]",
                 ],
                ['examples/car.png',
                 "front and right wheel, front and left wheel, rear and right wheel, rear and left wheel, "
                 "right headlight, left headlight, right taillight, left taillight, "
                 "front and right side of the top, front and left side of the top, rear and right side of the top, "
                 "rear and left side of the top",
                 "[[0, 2], [1, 3], [0, 1], [2, 3], [8, 10], [9, 11], [8, 9], [10, 11], [4, 0], "
                 "[4, 8], [4, 5], [5, 1], [5, 9], [6, 2], [6, 10], [7, 3], [7, 11], [6, 7]]"
                 ]
            ],
            inputs=[query_img, node_descriptions, edges],
            outputs=[query_img, node_descriptions, edges],
            fn=update_examples,
            run_on_click=True,
        )

    # Wire the Evaluate button to the full inference pipeline.
    eval_btn.click(fn=process,
                   inputs=[query_img, node_descriptions, edges],
                   outputs=[output_img])

    # Keep the graph preview in sync while the user edits either prompt.
    node_descriptions.change(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])
    edges.input(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])
|
| 430 |
+
|
| 431 |
+
# visualize_button.click(fn=visualize_graph,
|
| 432 |
+
# inputs=[node_descriptions, edges, state],
|
| 433 |
+
# outputs=[text_graph, state])
|
| 434 |
+
|
| 435 |
+
if __name__ == "__main__":
    # Resolve the checkpoint to load before launching the UI; process()
    # reads the module-level checkpoint_path on every Evaluate click.
    parser = argparse.ArgumentParser(description='CapeX Demo')
    parser.add_argument('--checkpoint',
                        help='checkpoint path',
                        default='swin-gte-split1.pth')
    args = parser.parse_args()
    checkpoint_path = args.checkpoint
    demo.launch()
|
configs/1shot-swin-clip/base_split1_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MMPose-style config: CapeX 1-shot, SwinV2-S image encoder + CLIP ViT-B/32
# text encoder, MP-100 split 1, baseline (no graph) variant.

# --- runtime / schedule -------------------------------------------------
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Single-channel keypoint layout; max_kpt_num caps keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# --- data pipelines -----------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

# --- datasets (MP-100 split 1) ------------------------------------------
data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/base_split2_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MMPose-style config: CapeX 1-shot, SwinV2-S image encoder + CLIP ViT-B/32
# text encoder, MP-100 split 2, baseline (no graph) variant. Identical to
# the split 1 config except the annotation files.

# --- runtime / schedule -------------------------------------------------
log_level = 'INFO'
load_from = None
resume_from = None
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Single-channel keypoint layout; max_kpt_num caps keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# --- data pipelines -----------------------------------------------------
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

# --- datasets (MP-100 split 2) ------------------------------------------
data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/base_split3_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=512,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/base_split4_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=512,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/base_split5_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=512,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/graph_split1_config.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
# total_epochs = 1
|
| 28 |
+
log_config = dict(
|
| 29 |
+
interval=50,
|
| 30 |
+
hooks=[
|
| 31 |
+
dict(type='TextLoggerHook'),
|
| 32 |
+
dict(type='TensorboardLoggerHook')
|
| 33 |
+
])
|
| 34 |
+
|
| 35 |
+
channel_cfg = dict(
|
| 36 |
+
num_output_channels=1,
|
| 37 |
+
dataset_joints=1,
|
| 38 |
+
dataset_channel=[
|
| 39 |
+
[
|
| 40 |
+
0,
|
| 41 |
+
],
|
| 42 |
+
],
|
| 43 |
+
inference_channel=[
|
| 44 |
+
0,
|
| 45 |
+
],
|
| 46 |
+
max_kpt_num=100)
|
| 47 |
+
|
| 48 |
+
# model settings
|
| 49 |
+
model = dict(
|
| 50 |
+
type='PoseAnythingModel',
|
| 51 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 52 |
+
text_pretrained="ViT-B/32",
|
| 53 |
+
finetune_text_pretrained=False,
|
| 54 |
+
encoder_config=dict(
|
| 55 |
+
type='SwinTransformerV2',
|
| 56 |
+
embed_dim=96,
|
| 57 |
+
depths=[2, 2, 18, 2],
|
| 58 |
+
num_heads=[3, 6, 12, 24],
|
| 59 |
+
window_size=16,
|
| 60 |
+
drop_path_rate=0.3,
|
| 61 |
+
img_size=256,
|
| 62 |
+
upsample="bilinear"
|
| 63 |
+
),
|
| 64 |
+
keypoint_head=dict(
|
| 65 |
+
type='PoseHead',
|
| 66 |
+
img_in_channels=768,
|
| 67 |
+
# text_in_channels=768,
|
| 68 |
+
text_in_channels=512,
|
| 69 |
+
transformer=dict(
|
| 70 |
+
type='EncoderDecoder',
|
| 71 |
+
d_model=256,
|
| 72 |
+
nhead=8,
|
| 73 |
+
num_encoder_layers=3,
|
| 74 |
+
num_decoder_layers=3,
|
| 75 |
+
graph_decoder='pre',
|
| 76 |
+
dim_feedforward=768,
|
| 77 |
+
dropout=0.1,
|
| 78 |
+
similarity_proj_dim=256,
|
| 79 |
+
dynamic_proj_dim=128,
|
| 80 |
+
activation="relu",
|
| 81 |
+
normalize_before=False,
|
| 82 |
+
return_intermediate_dec=True),
|
| 83 |
+
share_kpt_branch=False,
|
| 84 |
+
num_decoder_layer=3,
|
| 85 |
+
with_heatmap_loss=True,
|
| 86 |
+
|
| 87 |
+
heatmap_loss_weight=2.0,
|
| 88 |
+
support_order_dropout=-1,
|
| 89 |
+
positional_encoding=dict(
|
| 90 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 91 |
+
# training and testing settings
|
| 92 |
+
train_cfg=dict(),
|
| 93 |
+
test_cfg=dict(
|
| 94 |
+
flip_test=False,
|
| 95 |
+
post_process='default',
|
| 96 |
+
shift_heatmap=True,
|
| 97 |
+
modulate_kernel=11))
|
| 98 |
+
|
| 99 |
+
data_cfg = dict(
|
| 100 |
+
image_size=[256, 256],
|
| 101 |
+
heatmap_size=[64, 64],
|
| 102 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 103 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 104 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 105 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 106 |
+
|
| 107 |
+
train_pipeline = [
|
| 108 |
+
dict(type='LoadImageFromFile'),
|
| 109 |
+
dict(
|
| 110 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 111 |
+
scale_factor=0.15),
|
| 112 |
+
dict(type='TopDownAffineFewShot'),
|
| 113 |
+
dict(type='ToTensor'),
|
| 114 |
+
dict(
|
| 115 |
+
type='NormalizeTensor',
|
| 116 |
+
mean=[0.485, 0.456, 0.406],
|
| 117 |
+
std=[0.229, 0.224, 0.225]),
|
| 118 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 119 |
+
dict(
|
| 120 |
+
type='Collect',
|
| 121 |
+
keys=['img', 'target', 'target_weight'],
|
| 122 |
+
meta_keys=[
|
| 123 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 124 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 125 |
+
]),
|
| 126 |
+
]
|
| 127 |
+
|
| 128 |
+
valid_pipeline = [
|
| 129 |
+
dict(type='LoadImageFromFile'),
|
| 130 |
+
dict(type='TopDownAffineFewShot'),
|
| 131 |
+
dict(type='ToTensor'),
|
| 132 |
+
dict(
|
| 133 |
+
type='NormalizeTensor',
|
| 134 |
+
mean=[0.485, 0.456, 0.406],
|
| 135 |
+
std=[0.229, 0.224, 0.225]),
|
| 136 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 137 |
+
dict(
|
| 138 |
+
type='Collect',
|
| 139 |
+
keys=['img', 'target', 'target_weight'],
|
| 140 |
+
meta_keys=[
|
| 141 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 142 |
+
'flip_pairs', 'category_id',
|
| 143 |
+
'skeleton',
|
| 144 |
+
]),
|
| 145 |
+
]
|
| 146 |
+
|
| 147 |
+
test_pipeline = valid_pipeline
|
| 148 |
+
|
| 149 |
+
data_root = 'data/mp100'
|
| 150 |
+
data = dict(
|
| 151 |
+
samples_per_gpu=16,
|
| 152 |
+
workers_per_gpu=16,
|
| 153 |
+
# samples_per_gpu=8,
|
| 154 |
+
# workers_per_gpu=8,
|
| 155 |
+
train=dict(
|
| 156 |
+
type='TransformerPoseDataset',
|
| 157 |
+
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
|
| 158 |
+
img_prefix=f'{data_root}/images/',
|
| 159 |
+
# img_prefix=f'{data_root}',
|
| 160 |
+
data_cfg=data_cfg,
|
| 161 |
+
valid_class_ids=None,
|
| 162 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 163 |
+
num_shots=1,
|
| 164 |
+
pipeline=train_pipeline),
|
| 165 |
+
val=dict(
|
| 166 |
+
type='TransformerPoseDataset',
|
| 167 |
+
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
|
| 168 |
+
img_prefix=f'{data_root}/images/',
|
| 169 |
+
# img_prefix=f'{data_root}',
|
| 170 |
+
data_cfg=data_cfg,
|
| 171 |
+
valid_class_ids=None,
|
| 172 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 173 |
+
num_shots=1,
|
| 174 |
+
num_queries=15,
|
| 175 |
+
num_episodes=100,
|
| 176 |
+
pipeline=valid_pipeline),
|
| 177 |
+
test=dict(
|
| 178 |
+
type='TestPoseDataset',
|
| 179 |
+
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
|
| 180 |
+
img_prefix=f'{data_root}/images/',
|
| 181 |
+
# img_prefix=f'{data_root}',
|
| 182 |
+
data_cfg=data_cfg,
|
| 183 |
+
valid_class_ids=None,
|
| 184 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 185 |
+
num_shots=1,
|
| 186 |
+
num_queries=15,
|
| 187 |
+
num_episodes=200,
|
| 188 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 189 |
+
pipeline=test_pipeline),
|
| 190 |
+
)
|
| 191 |
+
vis_backends = [
|
| 192 |
+
dict(type='LocalVisBackend'),
|
| 193 |
+
dict(type='TensorboardVisBackend'),
|
| 194 |
+
]
|
| 195 |
+
visualizer = dict(
|
| 196 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 197 |
+
|
| 198 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/graph_split2_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
# text_in_channels=768,
|
| 67 |
+
text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/graph_split3_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
# text_in_channels=768,
|
| 67 |
+
text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/graph_split4_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
# text_in_channels=768,
|
| 67 |
+
text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-clip/graph_split5_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained="ViT-B/32",
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
# text_in_channels=768,
|
| 67 |
+
text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/base_split1_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/base_split2_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/base_split3_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/base_split4_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/base_split5_config.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
transformer=dict(
|
| 68 |
+
type='EncoderDecoder',
|
| 69 |
+
d_model=256,
|
| 70 |
+
nhead=8,
|
| 71 |
+
num_encoder_layers=3,
|
| 72 |
+
num_decoder_layers=3,
|
| 73 |
+
dim_feedforward=768,
|
| 74 |
+
dropout=0.1,
|
| 75 |
+
similarity_proj_dim=256,
|
| 76 |
+
dynamic_proj_dim=128,
|
| 77 |
+
activation="relu",
|
| 78 |
+
normalize_before=False,
|
| 79 |
+
return_intermediate_dec=True),
|
| 80 |
+
share_kpt_branch=False,
|
| 81 |
+
num_decoder_layer=3,
|
| 82 |
+
with_heatmap_loss=True,
|
| 83 |
+
|
| 84 |
+
heatmap_loss_weight=2.0,
|
| 85 |
+
support_order_dropout=-1,
|
| 86 |
+
positional_encoding=dict(
|
| 87 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 88 |
+
# training and testing settings
|
| 89 |
+
train_cfg=dict(),
|
| 90 |
+
test_cfg=dict(
|
| 91 |
+
flip_test=False,
|
| 92 |
+
post_process='default',
|
| 93 |
+
shift_heatmap=True,
|
| 94 |
+
modulate_kernel=11))
|
| 95 |
+
|
| 96 |
+
data_cfg = dict(
|
| 97 |
+
image_size=[256, 256],
|
| 98 |
+
heatmap_size=[64, 64],
|
| 99 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 100 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 101 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 102 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 103 |
+
|
| 104 |
+
train_pipeline = [
|
| 105 |
+
dict(type='LoadImageFromFile'),
|
| 106 |
+
dict(
|
| 107 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 108 |
+
scale_factor=0.15),
|
| 109 |
+
dict(type='TopDownAffineFewShot'),
|
| 110 |
+
dict(type='ToTensor'),
|
| 111 |
+
dict(
|
| 112 |
+
type='NormalizeTensor',
|
| 113 |
+
mean=[0.485, 0.456, 0.406],
|
| 114 |
+
std=[0.229, 0.224, 0.225]),
|
| 115 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 116 |
+
dict(
|
| 117 |
+
type='Collect',
|
| 118 |
+
keys=['img', 'target', 'target_weight'],
|
| 119 |
+
meta_keys=[
|
| 120 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 121 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 122 |
+
]),
|
| 123 |
+
]
|
| 124 |
+
|
| 125 |
+
valid_pipeline = [
|
| 126 |
+
dict(type='LoadImageFromFile'),
|
| 127 |
+
dict(type='TopDownAffineFewShot'),
|
| 128 |
+
dict(type='ToTensor'),
|
| 129 |
+
dict(
|
| 130 |
+
type='NormalizeTensor',
|
| 131 |
+
mean=[0.485, 0.456, 0.406],
|
| 132 |
+
std=[0.229, 0.224, 0.225]),
|
| 133 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 134 |
+
dict(
|
| 135 |
+
type='Collect',
|
| 136 |
+
keys=['img', 'target', 'target_weight'],
|
| 137 |
+
meta_keys=[
|
| 138 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 139 |
+
'flip_pairs', 'category_id',
|
| 140 |
+
'skeleton',
|
| 141 |
+
]),
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
test_pipeline = valid_pipeline
|
| 145 |
+
|
| 146 |
+
data_root = 'data/mp100'
|
| 147 |
+
data = dict(
|
| 148 |
+
samples_per_gpu=16,
|
| 149 |
+
workers_per_gpu=16,
|
| 150 |
+
# samples_per_gpu=8,
|
| 151 |
+
# workers_per_gpu=8,
|
| 152 |
+
train=dict(
|
| 153 |
+
type='TransformerPoseDataset',
|
| 154 |
+
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
|
| 155 |
+
img_prefix=f'{data_root}/images/',
|
| 156 |
+
# img_prefix=f'{data_root}',
|
| 157 |
+
data_cfg=data_cfg,
|
| 158 |
+
valid_class_ids=None,
|
| 159 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 160 |
+
num_shots=1,
|
| 161 |
+
pipeline=train_pipeline),
|
| 162 |
+
val=dict(
|
| 163 |
+
type='TransformerPoseDataset',
|
| 164 |
+
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
|
| 165 |
+
img_prefix=f'{data_root}/images/',
|
| 166 |
+
# img_prefix=f'{data_root}',
|
| 167 |
+
data_cfg=data_cfg,
|
| 168 |
+
valid_class_ids=None,
|
| 169 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 170 |
+
num_shots=1,
|
| 171 |
+
num_queries=15,
|
| 172 |
+
num_episodes=100,
|
| 173 |
+
pipeline=valid_pipeline),
|
| 174 |
+
test=dict(
|
| 175 |
+
type='TestPoseDataset',
|
| 176 |
+
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
|
| 177 |
+
img_prefix=f'{data_root}/images/',
|
| 178 |
+
# img_prefix=f'{data_root}',
|
| 179 |
+
data_cfg=data_cfg,
|
| 180 |
+
valid_class_ids=None,
|
| 181 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 182 |
+
num_shots=1,
|
| 183 |
+
num_queries=15,
|
| 184 |
+
num_episodes=200,
|
| 185 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 186 |
+
pipeline=test_pipeline),
|
| 187 |
+
)
|
| 188 |
+
vis_backends = [
|
| 189 |
+
dict(type='LocalVisBackend'),
|
| 190 |
+
dict(type='TensorboardVisBackend'),
|
| 191 |
+
]
|
| 192 |
+
visualizer = dict(
|
| 193 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 194 |
+
|
| 195 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/graph_split1_config.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
# total_epochs = 1
|
| 28 |
+
log_config = dict(
|
| 29 |
+
interval=50,
|
| 30 |
+
hooks=[
|
| 31 |
+
dict(type='TextLoggerHook'),
|
| 32 |
+
dict(type='TensorboardLoggerHook')
|
| 33 |
+
])
|
| 34 |
+
|
| 35 |
+
channel_cfg = dict(
|
| 36 |
+
num_output_channels=1,
|
| 37 |
+
dataset_joints=1,
|
| 38 |
+
dataset_channel=[
|
| 39 |
+
[
|
| 40 |
+
0,
|
| 41 |
+
],
|
| 42 |
+
],
|
| 43 |
+
inference_channel=[
|
| 44 |
+
0,
|
| 45 |
+
],
|
| 46 |
+
max_kpt_num=100)
|
| 47 |
+
|
| 48 |
+
# model settings
|
| 49 |
+
model = dict(
|
| 50 |
+
type='PoseAnythingModel',
|
| 51 |
+
pretrained="swinv2_base",
|
| 52 |
+
#'pretrained/swinv2_small_1k_500k.pth',
|
| 53 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 54 |
+
finetune_text_pretrained=False,
|
| 55 |
+
encoder_config=dict(
|
| 56 |
+
type='SwinTransformerV2',
|
| 57 |
+
embed_dim=96,
|
| 58 |
+
depths=[2, 2, 18, 2],
|
| 59 |
+
num_heads=[3, 6, 12, 24],
|
| 60 |
+
window_size=16,
|
| 61 |
+
drop_path_rate=0.3,
|
| 62 |
+
img_size=256,
|
| 63 |
+
upsample="bilinear"
|
| 64 |
+
),
|
| 65 |
+
keypoint_head=dict(
|
| 66 |
+
type='PoseHead',
|
| 67 |
+
img_in_channels=768,
|
| 68 |
+
text_in_channels=768,
|
| 69 |
+
# text_in_channels=512,
|
| 70 |
+
transformer=dict(
|
| 71 |
+
type='EncoderDecoder',
|
| 72 |
+
d_model=256,
|
| 73 |
+
nhead=8,
|
| 74 |
+
num_encoder_layers=3,
|
| 75 |
+
num_decoder_layers=3,
|
| 76 |
+
graph_decoder='pre',
|
| 77 |
+
dim_feedforward=768,
|
| 78 |
+
dropout=0.1,
|
| 79 |
+
similarity_proj_dim=256,
|
| 80 |
+
dynamic_proj_dim=128,
|
| 81 |
+
activation="relu",
|
| 82 |
+
normalize_before=False,
|
| 83 |
+
return_intermediate_dec=True),
|
| 84 |
+
share_kpt_branch=False,
|
| 85 |
+
num_decoder_layer=3,
|
| 86 |
+
with_heatmap_loss=True,
|
| 87 |
+
|
| 88 |
+
heatmap_loss_weight=2.0,
|
| 89 |
+
support_order_dropout=-1,
|
| 90 |
+
positional_encoding=dict(
|
| 91 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 92 |
+
# training and testing settings
|
| 93 |
+
train_cfg=dict(),
|
| 94 |
+
test_cfg=dict(
|
| 95 |
+
flip_test=False,
|
| 96 |
+
post_process='default',
|
| 97 |
+
shift_heatmap=True,
|
| 98 |
+
modulate_kernel=11))
|
| 99 |
+
|
| 100 |
+
data_cfg = dict(
|
| 101 |
+
image_size=[256, 256],
|
| 102 |
+
heatmap_size=[64, 64],
|
| 103 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 104 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 105 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 106 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 107 |
+
|
| 108 |
+
train_pipeline = [
|
| 109 |
+
dict(type='LoadImageFromFile'),
|
| 110 |
+
dict(
|
| 111 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 112 |
+
scale_factor=0.15),
|
| 113 |
+
dict(type='TopDownAffineFewShot'),
|
| 114 |
+
dict(type='ToTensor'),
|
| 115 |
+
dict(
|
| 116 |
+
type='NormalizeTensor',
|
| 117 |
+
mean=[0.485, 0.456, 0.406],
|
| 118 |
+
std=[0.229, 0.224, 0.225]),
|
| 119 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 120 |
+
dict(
|
| 121 |
+
type='Collect',
|
| 122 |
+
keys=['img', 'target', 'target_weight'],
|
| 123 |
+
meta_keys=[
|
| 124 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 125 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 126 |
+
]),
|
| 127 |
+
]
|
| 128 |
+
|
| 129 |
+
valid_pipeline = [
|
| 130 |
+
dict(type='LoadImageFromFile'),
|
| 131 |
+
dict(type='TopDownAffineFewShot'),
|
| 132 |
+
dict(type='ToTensor'),
|
| 133 |
+
dict(
|
| 134 |
+
type='NormalizeTensor',
|
| 135 |
+
mean=[0.485, 0.456, 0.406],
|
| 136 |
+
std=[0.229, 0.224, 0.225]),
|
| 137 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 138 |
+
dict(
|
| 139 |
+
type='Collect',
|
| 140 |
+
keys=['img', 'target', 'target_weight'],
|
| 141 |
+
meta_keys=[
|
| 142 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 143 |
+
'flip_pairs', 'category_id',
|
| 144 |
+
'skeleton',
|
| 145 |
+
]),
|
| 146 |
+
]
|
| 147 |
+
|
| 148 |
+
test_pipeline = valid_pipeline
|
| 149 |
+
|
| 150 |
+
data_root = 'data/mp100'
|
| 151 |
+
data = dict(
|
| 152 |
+
samples_per_gpu=16,
|
| 153 |
+
workers_per_gpu=16,
|
| 154 |
+
# samples_per_gpu=8,
|
| 155 |
+
# workers_per_gpu=8,
|
| 156 |
+
train=dict(
|
| 157 |
+
type='TransformerPoseDataset',
|
| 158 |
+
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
|
| 159 |
+
img_prefix=f'{data_root}/images/',
|
| 160 |
+
# img_prefix=f'{data_root}',
|
| 161 |
+
data_cfg=data_cfg,
|
| 162 |
+
valid_class_ids=None,
|
| 163 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 164 |
+
num_shots=1,
|
| 165 |
+
pipeline=train_pipeline),
|
| 166 |
+
val=dict(
|
| 167 |
+
type='TransformerPoseDataset',
|
| 168 |
+
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
|
| 169 |
+
img_prefix=f'{data_root}/images/',
|
| 170 |
+
# img_prefix=f'{data_root}',
|
| 171 |
+
data_cfg=data_cfg,
|
| 172 |
+
valid_class_ids=None,
|
| 173 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 174 |
+
num_shots=1,
|
| 175 |
+
num_queries=15,
|
| 176 |
+
num_episodes=100,
|
| 177 |
+
pipeline=valid_pipeline),
|
| 178 |
+
test=dict(
|
| 179 |
+
type='TestPoseDataset',
|
| 180 |
+
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
|
| 181 |
+
img_prefix=f'{data_root}/images/',
|
| 182 |
+
# img_prefix=f'{data_root}',
|
| 183 |
+
data_cfg=data_cfg,
|
| 184 |
+
valid_class_ids=None,
|
| 185 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 186 |
+
num_shots=1,
|
| 187 |
+
num_queries=15,
|
| 188 |
+
num_episodes=200,
|
| 189 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 190 |
+
pipeline=test_pipeline),
|
| 191 |
+
)
|
| 192 |
+
vis_backends = [
|
| 193 |
+
dict(type='LocalVisBackend'),
|
| 194 |
+
dict(type='TensorboardVisBackend'),
|
| 195 |
+
]
|
| 196 |
+
visualizer = dict(
|
| 197 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 198 |
+
|
| 199 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/graph_split2_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
# text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split2_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split2_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split2_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/graph_split3_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
# text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split3_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split3_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split3_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/graph_split4_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
# text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split4_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split4_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split4_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/1shot-swin-gte/graph_split5_config.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='pretrained/swinv2_small_1k_500k.pth',
|
| 51 |
+
text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
|
| 52 |
+
finetune_text_pretrained=False,
|
| 53 |
+
encoder_config=dict(
|
| 54 |
+
type='SwinTransformerV2',
|
| 55 |
+
embed_dim=96,
|
| 56 |
+
depths=[2, 2, 18, 2],
|
| 57 |
+
num_heads=[3, 6, 12, 24],
|
| 58 |
+
window_size=16,
|
| 59 |
+
drop_path_rate=0.3,
|
| 60 |
+
img_size=256,
|
| 61 |
+
upsample="bilinear"
|
| 62 |
+
),
|
| 63 |
+
keypoint_head=dict(
|
| 64 |
+
type='PoseHead',
|
| 65 |
+
img_in_channels=768,
|
| 66 |
+
text_in_channels=768,
|
| 67 |
+
# text_in_channels=512,
|
| 68 |
+
transformer=dict(
|
| 69 |
+
type='EncoderDecoder',
|
| 70 |
+
d_model=256,
|
| 71 |
+
nhead=8,
|
| 72 |
+
num_encoder_layers=3,
|
| 73 |
+
num_decoder_layers=3,
|
| 74 |
+
graph_decoder='pre',
|
| 75 |
+
dim_feedforward=768,
|
| 76 |
+
dropout=0.1,
|
| 77 |
+
similarity_proj_dim=256,
|
| 78 |
+
dynamic_proj_dim=128,
|
| 79 |
+
activation="relu",
|
| 80 |
+
normalize_before=False,
|
| 81 |
+
return_intermediate_dec=True),
|
| 82 |
+
share_kpt_branch=False,
|
| 83 |
+
num_decoder_layer=3,
|
| 84 |
+
with_heatmap_loss=True,
|
| 85 |
+
|
| 86 |
+
heatmap_loss_weight=2.0,
|
| 87 |
+
support_order_dropout=-1,
|
| 88 |
+
positional_encoding=dict(
|
| 89 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 90 |
+
# training and testing settings
|
| 91 |
+
train_cfg=dict(),
|
| 92 |
+
test_cfg=dict(
|
| 93 |
+
flip_test=False,
|
| 94 |
+
post_process='default',
|
| 95 |
+
shift_heatmap=True,
|
| 96 |
+
modulate_kernel=11))
|
| 97 |
+
|
| 98 |
+
data_cfg = dict(
|
| 99 |
+
image_size=[256, 256],
|
| 100 |
+
heatmap_size=[64, 64],
|
| 101 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 102 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 103 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 104 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 105 |
+
|
| 106 |
+
train_pipeline = [
|
| 107 |
+
dict(type='LoadImageFromFile'),
|
| 108 |
+
dict(
|
| 109 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 110 |
+
scale_factor=0.15),
|
| 111 |
+
dict(type='TopDownAffineFewShot'),
|
| 112 |
+
dict(type='ToTensor'),
|
| 113 |
+
dict(
|
| 114 |
+
type='NormalizeTensor',
|
| 115 |
+
mean=[0.485, 0.456, 0.406],
|
| 116 |
+
std=[0.229, 0.224, 0.225]),
|
| 117 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 118 |
+
dict(
|
| 119 |
+
type='Collect',
|
| 120 |
+
keys=['img', 'target', 'target_weight'],
|
| 121 |
+
meta_keys=[
|
| 122 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 123 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 124 |
+
]),
|
| 125 |
+
]
|
| 126 |
+
|
| 127 |
+
valid_pipeline = [
|
| 128 |
+
dict(type='LoadImageFromFile'),
|
| 129 |
+
dict(type='TopDownAffineFewShot'),
|
| 130 |
+
dict(type='ToTensor'),
|
| 131 |
+
dict(
|
| 132 |
+
type='NormalizeTensor',
|
| 133 |
+
mean=[0.485, 0.456, 0.406],
|
| 134 |
+
std=[0.229, 0.224, 0.225]),
|
| 135 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 136 |
+
dict(
|
| 137 |
+
type='Collect',
|
| 138 |
+
keys=['img', 'target', 'target_weight'],
|
| 139 |
+
meta_keys=[
|
| 140 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 141 |
+
'flip_pairs', 'category_id',
|
| 142 |
+
'skeleton',
|
| 143 |
+
]),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
test_pipeline = valid_pipeline
|
| 147 |
+
|
| 148 |
+
data_root = 'data/mp100'
|
| 149 |
+
data = dict(
|
| 150 |
+
samples_per_gpu=16,
|
| 151 |
+
workers_per_gpu=16,
|
| 152 |
+
# samples_per_gpu=8,
|
| 153 |
+
# workers_per_gpu=8,
|
| 154 |
+
train=dict(
|
| 155 |
+
type='TransformerPoseDataset',
|
| 156 |
+
ann_file=f'{data_root}/annotations/mp100_split5_train.json',
|
| 157 |
+
img_prefix=f'{data_root}/images/',
|
| 158 |
+
# img_prefix=f'{data_root}',
|
| 159 |
+
data_cfg=data_cfg,
|
| 160 |
+
valid_class_ids=None,
|
| 161 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 162 |
+
num_shots=1,
|
| 163 |
+
pipeline=train_pipeline),
|
| 164 |
+
val=dict(
|
| 165 |
+
type='TransformerPoseDataset',
|
| 166 |
+
ann_file=f'{data_root}/annotations/mp100_split5_val.json',
|
| 167 |
+
img_prefix=f'{data_root}/images/',
|
| 168 |
+
# img_prefix=f'{data_root}',
|
| 169 |
+
data_cfg=data_cfg,
|
| 170 |
+
valid_class_ids=None,
|
| 171 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 172 |
+
num_shots=1,
|
| 173 |
+
num_queries=15,
|
| 174 |
+
num_episodes=100,
|
| 175 |
+
pipeline=valid_pipeline),
|
| 176 |
+
test=dict(
|
| 177 |
+
type='TestPoseDataset',
|
| 178 |
+
ann_file=f'{data_root}/annotations/mp100_split5_test.json',
|
| 179 |
+
img_prefix=f'{data_root}/images/',
|
| 180 |
+
# img_prefix=f'{data_root}',
|
| 181 |
+
data_cfg=data_cfg,
|
| 182 |
+
valid_class_ids=None,
|
| 183 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 184 |
+
num_shots=1,
|
| 185 |
+
num_queries=15,
|
| 186 |
+
num_episodes=200,
|
| 187 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 188 |
+
pipeline=test_pipeline),
|
| 189 |
+
)
|
| 190 |
+
vis_backends = [
|
| 191 |
+
dict(type='LocalVisBackend'),
|
| 192 |
+
dict(type='TensorboardVisBackend'),
|
| 193 |
+
]
|
| 194 |
+
visualizer = dict(
|
| 195 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 196 |
+
|
| 197 |
+
shuffle_cfg = dict(interval=1)
|
configs/_base_/datasets/ap10k.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
dataset_info = dict(
|
| 2 |
+
dataset_name='ap10k',
|
| 3 |
+
paper_info=dict(
|
| 4 |
+
author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
|
| 5 |
+
'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
|
| 6 |
+
title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
|
| 7 |
+
container='35th Conference on Neural Information Processing Systems '
|
| 8 |
+
'(NeurIPS 2021) Track on Datasets and Bench-marks.',
|
| 9 |
+
year='2021',
|
| 10 |
+
homepage='https://github.com/AlexTheBad/AP-10K',
|
| 11 |
+
),
|
| 12 |
+
keypoint_info={
|
| 13 |
+
0:
|
| 14 |
+
dict(
|
| 15 |
+
name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
|
| 16 |
+
1:
|
| 17 |
+
dict(
|
| 18 |
+
name='R_Eye',
|
| 19 |
+
id=1,
|
| 20 |
+
color=[255, 128, 0],
|
| 21 |
+
type='upper',
|
| 22 |
+
swap='L_Eye'),
|
| 23 |
+
2:
|
| 24 |
+
dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
|
| 25 |
+
3:
|
| 26 |
+
dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
|
| 27 |
+
4:
|
| 28 |
+
dict(
|
| 29 |
+
name='Root of tail',
|
| 30 |
+
id=4,
|
| 31 |
+
color=[51, 153, 255],
|
| 32 |
+
type='lower',
|
| 33 |
+
swap=''),
|
| 34 |
+
5:
|
| 35 |
+
dict(
|
| 36 |
+
name='L_Shoulder',
|
| 37 |
+
id=5,
|
| 38 |
+
color=[51, 153, 255],
|
| 39 |
+
type='upper',
|
| 40 |
+
swap='R_Shoulder'),
|
| 41 |
+
6:
|
| 42 |
+
dict(
|
| 43 |
+
name='L_Elbow',
|
| 44 |
+
id=6,
|
| 45 |
+
color=[51, 153, 255],
|
| 46 |
+
type='upper',
|
| 47 |
+
swap='R_Elbow'),
|
| 48 |
+
7:
|
| 49 |
+
dict(
|
| 50 |
+
name='L_F_Paw',
|
| 51 |
+
id=7,
|
| 52 |
+
color=[0, 255, 0],
|
| 53 |
+
type='upper',
|
| 54 |
+
swap='R_F_Paw'),
|
| 55 |
+
8:
|
| 56 |
+
dict(
|
| 57 |
+
name='R_Shoulder',
|
| 58 |
+
id=8,
|
| 59 |
+
color=[0, 255, 0],
|
| 60 |
+
type='upper',
|
| 61 |
+
swap='L_Shoulder'),
|
| 62 |
+
9:
|
| 63 |
+
dict(
|
| 64 |
+
name='R_Elbow',
|
| 65 |
+
id=9,
|
| 66 |
+
color=[255, 128, 0],
|
| 67 |
+
type='upper',
|
| 68 |
+
swap='L_Elbow'),
|
| 69 |
+
10:
|
| 70 |
+
dict(
|
| 71 |
+
name='R_F_Paw',
|
| 72 |
+
id=10,
|
| 73 |
+
color=[0, 255, 0],
|
| 74 |
+
type='lower',
|
| 75 |
+
swap='L_F_Paw'),
|
| 76 |
+
11:
|
| 77 |
+
dict(
|
| 78 |
+
name='L_Hip',
|
| 79 |
+
id=11,
|
| 80 |
+
color=[255, 128, 0],
|
| 81 |
+
type='lower',
|
| 82 |
+
swap='R_Hip'),
|
| 83 |
+
12:
|
| 84 |
+
dict(
|
| 85 |
+
name='L_Knee',
|
| 86 |
+
id=12,
|
| 87 |
+
color=[255, 128, 0],
|
| 88 |
+
type='lower',
|
| 89 |
+
swap='R_Knee'),
|
| 90 |
+
13:
|
| 91 |
+
dict(
|
| 92 |
+
name='L_B_Paw',
|
| 93 |
+
id=13,
|
| 94 |
+
color=[0, 255, 0],
|
| 95 |
+
type='lower',
|
| 96 |
+
swap='R_B_Paw'),
|
| 97 |
+
14:
|
| 98 |
+
dict(
|
| 99 |
+
name='R_Hip', id=14, color=[0, 255, 0], type='lower',
|
| 100 |
+
swap='L_Hip'),
|
| 101 |
+
15:
|
| 102 |
+
dict(
|
| 103 |
+
name='R_Knee',
|
| 104 |
+
id=15,
|
| 105 |
+
color=[0, 255, 0],
|
| 106 |
+
type='lower',
|
| 107 |
+
swap='L_Knee'),
|
| 108 |
+
16:
|
| 109 |
+
dict(
|
| 110 |
+
name='R_B_Paw',
|
| 111 |
+
id=16,
|
| 112 |
+
color=[0, 255, 0],
|
| 113 |
+
type='lower',
|
| 114 |
+
swap='L_B_Paw'),
|
| 115 |
+
},
|
| 116 |
+
skeleton_info={
|
| 117 |
+
0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
|
| 118 |
+
1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
|
| 119 |
+
2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
|
| 120 |
+
3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
|
| 121 |
+
4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
|
| 122 |
+
5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
|
| 123 |
+
6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
|
| 124 |
+
7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]),
|
| 125 |
+
8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]),
|
| 126 |
+
9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]),
|
| 127 |
+
10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]),
|
| 128 |
+
11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]),
|
| 129 |
+
12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]),
|
| 130 |
+
13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]),
|
| 131 |
+
14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]),
|
| 132 |
+
15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]),
|
| 133 |
+
16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]),
|
| 134 |
+
},
|
| 135 |
+
joint_weights=[
|
| 136 |
+
1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
|
| 137 |
+
1.5
|
| 138 |
+
],
|
| 139 |
+
sigmas=[
|
| 140 |
+
0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
|
| 141 |
+
0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
|
| 142 |
+
])
|
configs/_base_/default_runtime.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
checkpoint_config = dict(interval=10)
|
| 2 |
+
|
| 3 |
+
log_config = dict(
|
| 4 |
+
interval=50,
|
| 5 |
+
hooks=[
|
| 6 |
+
dict(type='TextLoggerHook'),
|
| 7 |
+
# dict(type='TensorboardLoggerHook')
|
| 8 |
+
# dict(type='PaviLoggerHook') # for internal services
|
| 9 |
+
])
|
| 10 |
+
|
| 11 |
+
log_level = 'INFO'
|
| 12 |
+
load_from = None
|
| 13 |
+
resume_from = None
|
| 14 |
+
dist_params = dict(backend='nccl')
|
| 15 |
+
workflow = [('train', 1)]
|
| 16 |
+
|
| 17 |
+
# disable opencv multithreading to avoid system being overloaded
|
| 18 |
+
opencv_num_threads = 0
|
| 19 |
+
# set multi-process start method as `fork` to speed up the training
|
| 20 |
+
mp_start_method = 'fork'
|
configs/demo_b.py
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_level = 'INFO'
|
| 2 |
+
load_from = None
|
| 3 |
+
resume_from = None
|
| 4 |
+
dist_params = dict(backend='nccl')
|
| 5 |
+
workflow = [('train', 1)]
|
| 6 |
+
checkpoint_config = dict(interval=20)
|
| 7 |
+
evaluation = dict(
|
| 8 |
+
interval=25,
|
| 9 |
+
metric=['PCK', 'NME', 'AUC', 'EPE'],
|
| 10 |
+
key_indicator='PCK',
|
| 11 |
+
gpu_collect=True,
|
| 12 |
+
res_folder='')
|
| 13 |
+
optimizer = dict(
|
| 14 |
+
type='Adam',
|
| 15 |
+
lr=1e-5,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
optimizer_config = dict(grad_clip=None)
|
| 19 |
+
# learning policy
|
| 20 |
+
lr_config = dict(
|
| 21 |
+
policy='step',
|
| 22 |
+
warmup='linear',
|
| 23 |
+
warmup_iters=1000,
|
| 24 |
+
warmup_ratio=0.001,
|
| 25 |
+
step=[160, 180])
|
| 26 |
+
total_epochs = 200
|
| 27 |
+
log_config = dict(
|
| 28 |
+
interval=50,
|
| 29 |
+
hooks=[
|
| 30 |
+
dict(type='TextLoggerHook'),
|
| 31 |
+
dict(type='TensorboardLoggerHook')
|
| 32 |
+
])
|
| 33 |
+
|
| 34 |
+
channel_cfg = dict(
|
| 35 |
+
num_output_channels=1,
|
| 36 |
+
dataset_joints=1,
|
| 37 |
+
dataset_channel=[
|
| 38 |
+
[
|
| 39 |
+
0,
|
| 40 |
+
],
|
| 41 |
+
],
|
| 42 |
+
inference_channel=[
|
| 43 |
+
0,
|
| 44 |
+
],
|
| 45 |
+
max_kpt_num=100)
|
| 46 |
+
|
| 47 |
+
# model settings
|
| 48 |
+
model = dict(
|
| 49 |
+
type='PoseAnythingModel',
|
| 50 |
+
pretrained='swinv2_small',
|
| 51 |
+
encoder_config=dict(
|
| 52 |
+
type='SwinTransformerV2',
|
| 53 |
+
embed_dim=96,
|
| 54 |
+
depths=[2, 2, 18, 2],
|
| 55 |
+
num_heads=[3, 6, 12, 24],
|
| 56 |
+
window_size=16,
|
| 57 |
+
drop_path_rate=0.3,
|
| 58 |
+
img_size=256,
|
| 59 |
+
upsample="bilinear"
|
| 60 |
+
),
|
| 61 |
+
keypoint_head=dict(
|
| 62 |
+
type='PoseHead',
|
| 63 |
+
in_channels=768,
|
| 64 |
+
transformer=dict(
|
| 65 |
+
type='EncoderDecoder',
|
| 66 |
+
d_model=256,
|
| 67 |
+
nhead=8,
|
| 68 |
+
num_encoder_layers=3,
|
| 69 |
+
num_decoder_layers=3,
|
| 70 |
+
graph_decoder='pre',
|
| 71 |
+
dim_feedforward=768,
|
| 72 |
+
dropout=0.1,
|
| 73 |
+
similarity_proj_dim=256,
|
| 74 |
+
dynamic_proj_dim=128,
|
| 75 |
+
activation="relu",
|
| 76 |
+
normalize_before=False,
|
| 77 |
+
return_intermediate_dec=True),
|
| 78 |
+
share_kpt_branch=False,
|
| 79 |
+
num_decoder_layer=3,
|
| 80 |
+
with_heatmap_loss=True,
|
| 81 |
+
|
| 82 |
+
heatmap_loss_weight=2.0,
|
| 83 |
+
support_order_dropout=-1,
|
| 84 |
+
positional_encoding=dict(
|
| 85 |
+
type='SinePositionalEncoding', num_feats=128, normalize=True)),
|
| 86 |
+
# training and testing settings
|
| 87 |
+
train_cfg=dict(),
|
| 88 |
+
test_cfg=dict(
|
| 89 |
+
flip_test=False,
|
| 90 |
+
post_process='default',
|
| 91 |
+
shift_heatmap=True,
|
| 92 |
+
modulate_kernel=11))
|
| 93 |
+
|
| 94 |
+
data_cfg = dict(
|
| 95 |
+
image_size=[256, 256],
|
| 96 |
+
heatmap_size=[64, 64],
|
| 97 |
+
num_output_channels=channel_cfg['num_output_channels'],
|
| 98 |
+
num_joints=channel_cfg['dataset_joints'],
|
| 99 |
+
dataset_channel=channel_cfg['dataset_channel'],
|
| 100 |
+
inference_channel=channel_cfg['inference_channel'])
|
| 101 |
+
|
| 102 |
+
train_pipeline = [
|
| 103 |
+
dict(type='LoadImageFromFile'),
|
| 104 |
+
dict(
|
| 105 |
+
type='TopDownGetRandomScaleRotation', rot_factor=15,
|
| 106 |
+
scale_factor=0.15),
|
| 107 |
+
dict(type='TopDownAffineFewShot'),
|
| 108 |
+
dict(type='ToTensor'),
|
| 109 |
+
dict(
|
| 110 |
+
type='NormalizeTensor',
|
| 111 |
+
mean=[0.485, 0.456, 0.406],
|
| 112 |
+
std=[0.229, 0.224, 0.225]),
|
| 113 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 114 |
+
dict(
|
| 115 |
+
type='Collect',
|
| 116 |
+
keys=['img', 'target', 'target_weight'],
|
| 117 |
+
meta_keys=[
|
| 118 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
|
| 119 |
+
'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
|
| 120 |
+
]),
|
| 121 |
+
]
|
| 122 |
+
|
| 123 |
+
valid_pipeline = [
|
| 124 |
+
dict(type='LoadImageFromFile'),
|
| 125 |
+
dict(type='TopDownAffineFewShot'),
|
| 126 |
+
dict(type='ToTensor'),
|
| 127 |
+
dict(
|
| 128 |
+
type='NormalizeTensor',
|
| 129 |
+
mean=[0.485, 0.456, 0.406],
|
| 130 |
+
std=[0.229, 0.224, 0.225]),
|
| 131 |
+
dict(type='TopDownGenerateTargetFewShot', sigma=1),
|
| 132 |
+
dict(
|
| 133 |
+
type='Collect',
|
| 134 |
+
keys=['img', 'target', 'target_weight'],
|
| 135 |
+
meta_keys=[
|
| 136 |
+
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
|
| 137 |
+
'flip_pairs', 'category_id',
|
| 138 |
+
'skeleton',
|
| 139 |
+
]),
|
| 140 |
+
]
|
| 141 |
+
|
| 142 |
+
test_pipeline = valid_pipeline
|
| 143 |
+
|
| 144 |
+
data_root = 'data/mp100'
|
| 145 |
+
data = dict(
|
| 146 |
+
samples_per_gpu=8,
|
| 147 |
+
workers_per_gpu=8,
|
| 148 |
+
train=dict(
|
| 149 |
+
type='TransformerPoseDataset',
|
| 150 |
+
ann_file=f'{data_root}/annotations/mp100_split1_train.json',
|
| 151 |
+
img_prefix=f'{data_root}/images/',
|
| 152 |
+
# img_prefix=f'{data_root}',
|
| 153 |
+
data_cfg=data_cfg,
|
| 154 |
+
valid_class_ids=None,
|
| 155 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 156 |
+
num_shots=1,
|
| 157 |
+
pipeline=train_pipeline),
|
| 158 |
+
val=dict(
|
| 159 |
+
type='TransformerPoseDataset',
|
| 160 |
+
ann_file=f'{data_root}/annotations/mp100_split1_val.json',
|
| 161 |
+
img_prefix=f'{data_root}/images/',
|
| 162 |
+
# img_prefix=f'{data_root}',
|
| 163 |
+
data_cfg=data_cfg,
|
| 164 |
+
valid_class_ids=None,
|
| 165 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 166 |
+
num_shots=1,
|
| 167 |
+
num_queries=15,
|
| 168 |
+
num_episodes=100,
|
| 169 |
+
pipeline=valid_pipeline),
|
| 170 |
+
test=dict(
|
| 171 |
+
type='TestPoseDataset',
|
| 172 |
+
ann_file=f'{data_root}/annotations/mp100_split1_test.json',
|
| 173 |
+
img_prefix=f'{data_root}/images/',
|
| 174 |
+
# img_prefix=f'{data_root}',
|
| 175 |
+
data_cfg=data_cfg,
|
| 176 |
+
valid_class_ids=None,
|
| 177 |
+
max_kpt_num=channel_cfg['max_kpt_num'],
|
| 178 |
+
num_shots=1,
|
| 179 |
+
num_queries=15,
|
| 180 |
+
num_episodes=200,
|
| 181 |
+
pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
|
| 182 |
+
pipeline=test_pipeline),
|
| 183 |
+
)
|
| 184 |
+
vis_backends = [
|
| 185 |
+
dict(type='LocalVisBackend'),
|
| 186 |
+
dict(type='TensorboardVisBackend'),
|
| 187 |
+
]
|
| 188 |
+
visualizer = dict(
|
| 189 |
+
type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
|
| 190 |
+
|
| 191 |
+
shuffle_cfg = dict(interval=1)
|
demo_text.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import copy
|
| 3 |
+
import os
|
| 4 |
+
import pickle
|
| 5 |
+
import random
|
| 6 |
+
import cv2
|
| 7 |
+
import numpy as np
|
| 8 |
+
import string
|
| 9 |
+
import torch
|
| 10 |
+
from mmcv import Config, DictAction
|
| 11 |
+
from mmcv.cnn import fuse_conv_bn
|
| 12 |
+
from mmcv.runner import load_checkpoint
|
| 13 |
+
from mmpose.core import wrap_fp16_model
|
| 14 |
+
from mmpose.models import build_posenet
|
| 15 |
+
from torchvision import transforms
|
| 16 |
+
from models import *
|
| 17 |
+
import torchvision.transforms.functional as F
|
| 18 |
+
|
| 19 |
+
from tools.visualization import plot_results, plot_query_results, plot_modified_query
|
| 20 |
+
import ast
|
| 21 |
+
import shutil
|
| 22 |
+
|
| 23 |
+
# Fixed 19-entry color palette for drawing keypoints/limbs in visualizations.
# NOTE(review): the values sweep the hue wheel; the first and last entries are
# both [255, 0, 0]. Channel order (RGB vs BGR) depends on the consumer — this
# module reads images with cv2 (BGR) but flips channels before inference;
# confirm against the plotting helpers before relying on exact colors.
COLORS = [
    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
|
| 28 |
+
|
| 29 |
+
class Resize_Pad:
    """Pad a CHW tensor image to a square, then resize it to (h, w).

    Used as the final step of the query-image preprocessing pipeline so
    that images of arbitrary aspect ratio map onto the model's square
    input without distortion.

    NOTE(review): in ``__call__`` the unpacked names are misleading — for
    a CHW tensor, ``w_1`` is actually the height (dim 1) and ``h_1`` the
    width (dim 2). The padding math is symmetric, so a square still comes
    out, but confirm orientation before trusting the branch comments.
    """

    def __init__(self, w=256, h=256):
        # Target output width and height in pixels.
        self.w = w
        self.h = h

    def __call__(self, image):
        # image: tensor of shape (C, dim1, dim2).
        _, w_1, h_1 = image.shape
        ratio_1 = w_1 / h_1
        # check if the original and final aspect ratios are the same within a margin
        if round(ratio_1, 2) != 1:
            # padding to preserve aspect ratio
            if ratio_1 > 1:  # Make the image higher
                # Symmetric pad on both sides; an odd size difference loses
                # one pixel because hp // 2 is applied twice.
                hp = int(w_1 - h_1)
                hp = hp // 2
                # F.pad takes (left, top, right, bottom).
                image = F.pad(image, (hp, 0, hp, 0), 0, "constant")
                return F.resize(image, [self.h, self.w])
            else:
                wp = int(h_1 - w_1)
                wp = wp // 2
                image = F.pad(image, (0, wp, 0, wp), 0, "constant")
                return F.resize(image, [self.h, self.w])
        else:
            # Already (near-)square: resize directly, no padding needed.
            return F.resize(image, [self.h, self.w])
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def transform_keypoints_to_pad_and_resize(keypoints, image_size, target_size=256):
    """Map keypoints from original-image coordinates into the pad-then-resize frame.

    Mirrors the geometry of ``Resize_Pad``: the shorter side is padded
    symmetrically to make the image square, then the square is resized to
    ``target_size`` x ``target_size``.

    Args:
        keypoints (torch.Tensor): (K, 2) tensor of (x, y) coordinates.
        image_size (sequence): original image shape as ``(h, w, ...)``
            (e.g. ``ndarray.shape`` from cv2).
        target_size (int): side length of the square output frame.
            Default: 256 (the original hard-coded value).

    Returns:
        torch.Tensor: (K, 2) keypoints in the padded/resized frame.
    """
    trans_keypoints = keypoints.clone()
    h, w = image_size[:2]
    scale_side = float(target_size)
    if w > h:
        # Width dominates - the image was padded vertically (top/bottom),
        # so y shifts by half the padding and everything scales by 1/w.
        pad = (w - h) // 2
        trans_keypoints[:, 1] = keypoints[:, 1] + pad
        trans_keypoints *= (scale_side / w)
    else:
        # Height dominates - the image was padded horizontally (left/right).
        # BUG FIX: the original computed image_size[1] - image_size[0]
        # (i.e. w - h, which is negative here), shifting x the wrong way;
        # the horizontal pad is (h - w) // 2.
        pad = (h - w) // 2
        trans_keypoints[:, 0] = keypoints[:, 0] + pad
        trans_keypoints *= (scale_side / h)
    return trans_keypoints
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def parse_args():
    """Parse command-line arguments for the text-prompted pose demo.

    Returns:
        argparse.Namespace: parsed arguments (support_points,
        support_skeleton, query, config, checkpoint, outdir,
        fuse_conv_bn, cfg_options).
    """
    parser = argparse.ArgumentParser(description='Pose Anything Demo')
    parser.add_argument('--support_points',
                        help='support keypoints text descriptions')
    parser.add_argument('--support_skeleton',
                        help='list of keypoints skeleton')
    parser.add_argument('--query', help='Image file')
    parser.add_argument('--config', default=None, help='test config file path')
    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
    # BUG FIX: help text previously said 'checkpoint file' (copy-paste error).
    parser.add_argument('--outdir', default='output',
                        help='output directory for result images')

    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        # BUG FIX: added trailing space so the concatenated help string does
        # not render as "increasethe".
        help='Whether to fuse conv and bn, this will slightly increase '
        'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. For example, '
        "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    args = parser.parse_args()
    return args
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def merge_configs(cfg1, cfg2):
    """Merge ``cfg2`` into a copy of ``cfg1``.

    On key collision the value from ``cfg2`` wins; entries of ``cfg2``
    whose value is ``None`` are ignored. Either argument may be ``None``.

    Args:
        cfg1 (dict | None): base config; not mutated.
        cfg2 (dict | None): overriding config.

    Returns:
        dict: the merged config.
    """
    cfg1 = {} if cfg1 is None else cfg1.copy()
    cfg2 = {} if cfg2 is None else cfg2
    for k, v in cfg2.items():
        # BUG FIX: the original used `if v:`, which silently dropped valid
        # falsy overrides (0, False, '', []); the documented contract is to
        # skip only None.
        if v is not None:
            cfg1[k] = v
    return cfg1
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def main():
    """Run single-image, text-prompted pose inference.

    Pipeline: seed RNGs, load the mmcv config, read the query image,
    build placeholder support tensors from the keypoint text
    descriptions, build/load the pose model, run inference, and save
    a visualization of the predicted keypoints to ``args.outdir``.
    """
    # Fix all RNG seeds for reproducible output.
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    args = parse_args()
    cfg = Config.fromfile(args.config)

    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    os.makedirs(args.outdir, exist_ok=True)

    # Load data: keypoint text descriptions are passed as a Python-literal
    # string (e.g. "['left eye', 'right eye']") and parsed safely.
    point_descriptions = ast.literal_eval(args.support_points)
    query_img = cv2.imread(args.query)
    if query_img is None:
        raise ValueError('Fail to read image')

    # just a placeholder, we don't have input keypoints
    kp_src = torch.zeros((len(point_descriptions), 2))

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size,
                   cfg.model.encoder_config.img_size)])

    if args.support_skeleton is not None:
        skeleton = ast.literal_eval(args.support_skeleton)
        if len(skeleton) == 0:
            skeleton = [(0, 0)]
    else:
        # BUG FIX: `skeleton` was previously left undefined when
        # --support_skeleton was omitted, raising NameError below; fall back
        # to the same dummy edge used for an empty skeleton list.
        skeleton = [(0, 0)]

    model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # cv2 loads BGR; flip(0) on the CHW tensor reorders channels to RGB.
    query_img = preprocess(query_img).flip(0)[None].to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
                                       cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Promote (K, 2) keypoints to (K, 3); the zero third column marks
    # "not visible" / placeholder in the 3d-joints convention.
    kp_src_3d = torch.concatenate(
        (kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.concatenate(
        (torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1)

    # everything that is related to the support image is used as placeholder
    target_s, target_weight_s = genHeatMap._msra_generate_target(
        data_cfg, kp_src_3d, kp_src_3d_weight, sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    # NOTE(review): target_s stays on CPU while target_weight_s is moved to
    # the model device — presumably the model moves it internally; confirm.
    target_weight_s = torch.tensor(target_weight_s).float()[None].to(model_device)

    data = {
        'img_s': [0],
        'img_q': query_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [skeleton],
                       'query_skeleton': skeleton,
                       'sample_point_descriptions': np.array([point_descriptions]),
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src.mean(dim=0)],
                       'query_center': kp_src.mean(dim=0),
                       'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]],
                       'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }

    # Load model
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    model.to(model_device)
    model.eval()

    with torch.no_grad():
        outputs = model(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    # Convert CHW tensor back to HWC numpy for plotting.
    vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    name_idx = plot_query_results(vis_q_image, vis_q_weight, skeleton,
                                  torch.tensor(outputs['points']).squeeze(0),
                                  out_dir=args.outdir)
    # Keep a copy of the raw input next to the rendered result.
    shutil.copyfile(args.query, f'./{args.outdir}/{str(name_idx)}_query_in.png')
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == '__main__':
    main()
|
docker/Dockerfile
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ARG PYTORCH="2.0.1"
|
| 2 |
+
ARG CUDA="11.7"
|
| 3 |
+
ARG CUDNN="8"
|
| 4 |
+
|
| 5 |
+
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
|
| 6 |
+
|
| 7 |
+
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
|
| 8 |
+
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
|
| 9 |
+
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
|
| 10 |
+
ENV TZ=Asia/Kolkata DEBIAN_FRONTEND=noninteractive
|
| 11 |
+
# To fix GPG key error when running apt-get update
|
| 12 |
+
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
|
| 13 |
+
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
|
| 14 |
+
|
| 15 |
+
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\
|
| 16 |
+
&& apt-get clean \
|
| 17 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
+
|
| 19 |
+
# Install xtcocotools
|
| 20 |
+
RUN pip install cython
|
| 21 |
+
RUN pip install xtcocotools
|
| 22 |
+
# Install MMEngine and MMCV
|
| 23 |
+
RUN pip install openmim
|
| 24 |
+
RUN mim install mmengine
|
| 25 |
+
RUN mim install "mmpose==0.28.1"
|
| 26 |
+
RUN mim install "mmcv-full==1.5.3"
|
| 27 |
+
RUN pip install -U torchmetrics timm
|
| 28 |
+
RUN pip install numpy scipy --upgrade
|
| 29 |
+
RUN pip install future tensorboard
|
| 30 |
+
|
| 31 |
+
# some other requirements
|
| 32 |
+
RUN pip install git+https://github.com/openai/CLIP.git
|
| 33 |
+
RUN pip install yapf==0.40.1
|
| 34 |
+
RUN pip install transformers
|
| 35 |
+
|
| 36 |
+
WORKDIR CapeX
|
| 37 |
+
|
| 38 |
+
COPY models CapeX/models
|
| 39 |
+
COPY configs CapeX/configs
|
| 40 |
+
COPY pretrained CapeX/pretrained
|
| 41 |
+
COPY requirements.txt CapeX/
|
| 42 |
+
COPY tools CapeX/tools
|
| 43 |
+
COPY setup.cfg CapeX/
|
| 44 |
+
COPY setup.py CapeX/
|
| 45 |
+
COPY test.py CapeX/
|
| 46 |
+
COPY train.py CapeX/
|
| 47 |
+
COPY README.md CapeX/
|
| 48 |
+
COPY run_me.sh CapeX/
|
| 49 |
+
|
| 50 |
+
RUN mkdir -p CapeX/data/mp100
|
| 51 |
+
WORKDIR CapeX
|
| 52 |
+
|
| 53 |
+
# Install MMPose
|
| 54 |
+
RUN conda clean --all
|
| 55 |
+
ENV FORCE_CUDA="1"
|
| 56 |
+
RUN python setup.py develop
|
| 57 |
+
|
| 58 |
+
#CMD ["bash"]
|
| 59 |
+
#CMD ["/bin/bash", "-c", "chmod +x run_me.sh && ./run_me.sh"]
|
environment.yml
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: capex
|
| 2 |
+
channels:
|
| 3 |
+
- pytorch
|
| 4 |
+
- nvidia
|
| 5 |
+
- defaults
|
| 6 |
+
dependencies:
|
| 7 |
+
- _libgcc_mutex=0.1
|
| 8 |
+
- _openmp_mutex=5.1
|
| 9 |
+
- blas=1.0
|
| 10 |
+
- ca-certificates=2024.3.11
|
| 11 |
+
- cuda-cudart=12.1.105
|
| 12 |
+
- cuda-cupti=12.1.105
|
| 13 |
+
- cuda-libraries=12.1.0
|
| 14 |
+
- cuda-nvrtc=12.1.105
|
| 15 |
+
- cuda-nvtx=12.1.105
|
| 16 |
+
- cuda-opencl=12.4.99
|
| 17 |
+
- cuda-runtime=12.1.0
|
| 18 |
+
- cudatoolkit=11.8.0
|
| 19 |
+
- filelock=3.13.1
|
| 20 |
+
- gmp=6.2.1
|
| 21 |
+
- gmpy2=2.1.2
|
| 22 |
+
- intel-openmp=2023.1.0
|
| 23 |
+
- jinja2=3.1.3
|
| 24 |
+
- ld_impl_linux-64=2.38
|
| 25 |
+
- libcublas=12.1.0.26
|
| 26 |
+
- libcufft=11.0.2.4
|
| 27 |
+
- libcufile=1.9.0.20
|
| 28 |
+
- libcurand=10.3.5.119
|
| 29 |
+
- libcusolver=11.4.4.55
|
| 30 |
+
- libcusparse=12.0.2.55
|
| 31 |
+
- libffi=3.4.4
|
| 32 |
+
- libgcc-ng=11.2.0
|
| 33 |
+
- libgomp=11.2.0
|
| 34 |
+
- libnpp=12.0.2.50
|
| 35 |
+
- libnvjitlink=12.1.105
|
| 36 |
+
- libnvjpeg=12.1.1.14
|
| 37 |
+
- libstdcxx-ng=11.2.0
|
| 38 |
+
- markupsafe=2.1.3
|
| 39 |
+
- mkl=2023.1.0
|
| 40 |
+
- mpc=1.1.0
|
| 41 |
+
- mpfr=4.0.2
|
| 42 |
+
- mpmath=1.3.0
|
| 43 |
+
- ncurses=6.4
|
| 44 |
+
- networkx=3.1
|
| 45 |
+
- openssl=3.0.13
|
| 46 |
+
- pip=23.3.1
|
| 47 |
+
- python=3.8.18
|
| 48 |
+
- pytorch-cuda=12.1
|
| 49 |
+
- pytorch-mutex=1.0
|
| 50 |
+
- readline=8.2
|
| 51 |
+
- setuptools=68.2.2
|
| 52 |
+
- sqlite=3.41.2
|
| 53 |
+
- sympy=1.12
|
| 54 |
+
- tbb=2021.8.0
|
| 55 |
+
- tk=8.6.12
|
| 56 |
+
- typing_extensions=4.9.0
|
| 57 |
+
- wheel=0.41.2
|
| 58 |
+
- xz=5.4.6
|
| 59 |
+
- zlib=1.2.13
|
| 60 |
+
- pip:
|
| 61 |
+
- absl-py==2.1.0
|
| 62 |
+
- addict==2.4.0
|
| 63 |
+
- aiofiles==23.2.1
|
| 64 |
+
- aiohttp==3.9.3
|
| 65 |
+
- aiosignal==1.3.1
|
| 66 |
+
- altair==5.3.0
|
| 67 |
+
- annotated-types==0.6.0
|
| 68 |
+
- antlr4-python3-runtime==4.9.3
|
| 69 |
+
- anyio==4.3.0
|
| 70 |
+
- async-timeout==4.0.3
|
| 71 |
+
- attrs==23.2.0
|
| 72 |
+
- cachetools==5.3.3
|
| 73 |
+
- certifi==2024.2.2
|
| 74 |
+
- charset-normalizer==3.3.2
|
| 75 |
+
- chumpy==0.70
|
| 76 |
+
- click==8.1.7
|
| 77 |
+
- git+https://github.com/openai/CLIP.git
|
| 78 |
+
- contourpy==1.1.1
|
| 79 |
+
- cycler==0.12.1
|
| 80 |
+
- cython==3.0.9
|
| 81 |
+
- dnspython==2.6.1
|
| 82 |
+
- email-validator==2.1.1
|
| 83 |
+
- exceptiongroup==1.2.1
|
| 84 |
+
- fastapi==0.111.0
|
| 85 |
+
- fastapi-cli==0.0.3
|
| 86 |
+
- ffmpy==0.3.2
|
| 87 |
+
- fonttools==4.49.0
|
| 88 |
+
- frozenlist==1.4.1
|
| 89 |
+
- fsspec==2024.2.0
|
| 90 |
+
- ftfy==6.2.0
|
| 91 |
+
- future==1.0.0
|
| 92 |
+
- google-auth==2.28.2
|
| 93 |
+
- google-auth-oauthlib==1.0.0
|
| 94 |
+
- gradio==4.31.0
|
| 95 |
+
- gradio-client==0.16.2
|
| 96 |
+
- grpcio==1.62.1
|
| 97 |
+
- h11==0.14.0
|
| 98 |
+
- httpcore==1.0.5
|
| 99 |
+
- httptools==0.6.1
|
| 100 |
+
- httpx==0.27.0
|
| 101 |
+
- huggingface-hub==0.21.4
|
| 102 |
+
- idna==3.6
|
| 103 |
+
- importlib-metadata==7.0.1
|
| 104 |
+
- importlib-resources==6.1.2
|
| 105 |
+
- joblib==1.4.0
|
| 106 |
+
- json-tricks==3.17.3
|
| 107 |
+
- jsonschema==4.22.0
|
| 108 |
+
- jsonschema-specifications==2023.12.1
|
| 109 |
+
- kiwisolver==1.4.5
|
| 110 |
+
- kornia==0.7.2
|
| 111 |
+
- kornia-rs==0.1.3
|
| 112 |
+
- lightning-utilities==0.11.2
|
| 113 |
+
- markdown==3.5.2
|
| 114 |
+
- markdown-it-py==3.0.0
|
| 115 |
+
- matplotlib==3.7.5
|
| 116 |
+
- mdurl==0.1.2
|
| 117 |
+
- mmcv-full==1.6.2
|
| 118 |
+
- mmpose==0.29.0
|
| 119 |
+
- multidict==6.0.5
|
| 120 |
+
- munkres==1.1.4
|
| 121 |
+
- numpy==1.24.4
|
| 122 |
+
- nvidia-cublas-cu12==12.1.3.1
|
| 123 |
+
- nvidia-cuda-cupti-cu12==12.1.105
|
| 124 |
+
- nvidia-cuda-nvrtc-cu12==12.1.105
|
| 125 |
+
- nvidia-cuda-runtime-cu12==12.1.105
|
| 126 |
+
- nvidia-cudnn-cu12==8.9.2.26
|
| 127 |
+
- nvidia-cufft-cu12==11.0.2.54
|
| 128 |
+
- nvidia-curand-cu12==10.3.2.106
|
| 129 |
+
- nvidia-cusolver-cu12==11.4.5.107
|
| 130 |
+
- nvidia-cusparse-cu12==12.1.0.106
|
| 131 |
+
- nvidia-nccl-cu12==2.19.3
|
| 132 |
+
- nvidia-nvjitlink-cu12==12.4.99
|
| 133 |
+
- nvidia-nvtx-cu12==12.1.105
|
| 134 |
+
- oauthlib==3.2.2
|
| 135 |
+
- omegaconf==2.3.0
|
| 136 |
+
- opencv-python==4.9.0.80
|
| 137 |
+
- orjson==3.10.3
|
| 138 |
+
- packaging==23.2
|
| 139 |
+
- pandas==2.0.3
|
| 140 |
+
- pillow==10.2.0
|
| 141 |
+
- pkgutil-resolve-name==1.3.10
|
| 142 |
+
- platformdirs==4.2.0
|
| 143 |
+
- protobuf==4.25.3
|
| 144 |
+
- pyasn1==0.5.1
|
| 145 |
+
- pyasn1-modules==0.3.0
|
| 146 |
+
- pydantic==2.7.1
|
| 147 |
+
- pydantic-core==2.18.2
|
| 148 |
+
- pydub==0.25.1
|
| 149 |
+
- pygments==2.18.0
|
| 150 |
+
- pyparsing==3.1.2
|
| 151 |
+
- python-dateutil==2.9.0.post0
|
| 152 |
+
- python-dotenv==1.0.1
|
| 153 |
+
- python-multipart==0.0.9
|
| 154 |
+
- pytorch-lightning==2.2.1
|
| 155 |
+
- pytz==2024.1
|
| 156 |
+
- pyyaml==6.0.1
|
| 157 |
+
- referencing==0.35.1
|
| 158 |
+
- regex==2023.12.25
|
| 159 |
+
- requests==2.31.0
|
| 160 |
+
- requests-oauthlib==1.4.0
|
| 161 |
+
- rich==13.7.1
|
| 162 |
+
- rpds-py==0.18.1
|
| 163 |
+
- rsa==4.9
|
| 164 |
+
- ruff==0.4.4
|
| 165 |
+
- safetensors==0.4.2
|
| 166 |
+
- scikit-learn==1.3.2
|
| 167 |
+
- scipy==1.10.1
|
| 168 |
+
- semantic-version==2.10.0
|
| 169 |
+
- sentencepiece==0.2.0
|
| 170 |
+
- shellingham==1.5.4
|
| 171 |
+
- six==1.16.0
|
| 172 |
+
- sniffio==1.3.1
|
| 173 |
+
- starlette==0.37.2
|
| 174 |
+
- tensorboard==2.14.0
|
| 175 |
+
- tensorboard-data-server==0.7.2
|
| 176 |
+
- threadpoolctl==3.4.0
|
| 177 |
+
- timm==0.4.12
|
| 178 |
+
- tokenizers==0.15.2
|
| 179 |
+
- tomli==2.0.1
|
| 180 |
+
- tomlkit==0.12.0
|
| 181 |
+
- toolz==0.12.1
|
| 182 |
+
- torch==2.2.1
|
| 183 |
+
- torchmetrics==1.3.2
|
| 184 |
+
- torchvision==0.17.1
|
| 185 |
+
- tqdm==4.66.2
|
| 186 |
+
- transformers==4.38.2
|
| 187 |
+
- triton==2.2.0
|
| 188 |
+
- typer==0.12.3
|
| 189 |
+
- tzdata==2024.1
|
| 190 |
+
- ujson==5.9.0
|
| 191 |
+
- urllib3==2.2.1
|
| 192 |
+
- uvicorn==0.29.0
|
| 193 |
+
- uvloop==0.19.0
|
| 194 |
+
- watchfiles==0.21.0
|
| 195 |
+
- wcwidth==0.2.13
|
| 196 |
+
- websockets==11.0.3
|
| 197 |
+
- werkzeug==3.0.1
|
| 198 |
+
- xtcocotools==1.14.3
|
| 199 |
+
- yapf==0.40.1
|
| 200 |
+
- yarl==1.9.4
|
| 201 |
+
- zipp==3.17.0
|
examples/animal.png
ADDED
|
examples/car.png
ADDED
|
examples/chair.png
ADDED
|
examples/person.png
ADDED
|
models/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
0.2.0
|
models/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .core import * # noqa
|
| 2 |
+
from .datasets import * # noqa
|
| 3 |
+
from .models import * # noqa
|
models/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (195 Bytes). View file
|
|
|
models/apis/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .train import train_model
|
| 2 |
+
|
| 3 |
+
__all__ = [
|
| 4 |
+
'train_model'
|
| 5 |
+
]
|
models/apis/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (206 Bytes). View file
|
|
|
models/apis/__pycache__/train.cpython-38.pyc
ADDED
|
Binary file (3.15 kB). View file
|
|
|
models/apis/train.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from models.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook
|
| 5 |
+
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
|
| 6 |
+
from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
|
| 7 |
+
build_optimizer)
|
| 8 |
+
from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook
|
| 9 |
+
from mmpose.datasets import build_dataloader
|
| 10 |
+
from mmpose.utils import get_root_logger
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def train_model(model,
                dataset,
                val_dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Builds train dataloaders, wraps the model for (non-)distributed
    execution, constructs an ``EpochBasedRunner``, registers training /
    pair-shuffle / evaluation hooks, optionally resumes or loads a
    checkpoint, and launches the training workflow.

    Args:
        model (nn.Module): The model to be trained.
        dataset (Dataset): Train dataset.
        val_dataset (Dataset): Validation dataset, only used when
            ``validate`` is True.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    dataloader_setting = dict(
        samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
        workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
        # cfg.gpus will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed,
        pin_memory=False,
    )
    # Values from cfg.data.train_dataloader override the defaults above.
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **dataloader_setting) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters',
                                         False)  # NOTE: True has been modified to False for faster training.
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # Optionally re-shuffle support/query pairings between epochs (one hook
    # per train dataloader).
    shuffle_cfg = cfg.get('shuffle_cfg', None)
    if shuffle_cfg is not None:
        for data_loader in data_loaders:
            runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg))

    # register eval hooks
    if validate:
        # NOTE(review): assumes cfg.evaluation contains 'res_folder' —
        # this line raises KeyError otherwise; confirm against the configs.
        eval_cfg = cfg.get('evaluation', {})
        eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder'])
        # Validation always runs one sample per GPU, without shuffling.
        dataloader_setting = dict(
            # samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False,
            pin_memory=False,
        )
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # Resuming a run takes precedence over loading pretrained weights.
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
|
models/core/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
models/core/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (141 Bytes). View file
|
|
|
models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc
ADDED
|
Binary file (1.26 kB). View file
|
|
|
models/core/custom_hooks/shuffle_hooks.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmcv.runner import Hook
|
| 2 |
+
from mmpose.utils import get_root_logger
|
| 3 |
+
from torch.utils.data import DataLoader
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ShufflePairedSamplesHook(Hook):
    """Non-Distributed ShufflePairedSamples.

    Every ``interval`` training epochs, invokes
    ``dataloader.dataset.random_paired_samples()`` so the few-shot
    dataset re-draws its support/query pairings for the next epoch.
    """

    def __init__(self,
                 dataloader,
                 interval=1):
        # Fail fast on anything that is not a real DataLoader.
        if not isinstance(dataloader, DataLoader):
            raise TypeError(f'dataloader must be a pytorch DataLoader, '
                            f'but got {type(dataloader)}')

        self.dataloader = dataloader
        self.interval = interval
        self.logger = get_root_logger()

    def after_train_epoch(self, runner):
        """Called after every training epoch to evaluate the results."""
        # Only act when the configured epoch interval has elapsed.
        if self.every_n_epochs(runner, self.interval):
            self.dataloader.dataset.random_paired_samples()
|
models/datasets/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .builder import * # noqa
|
| 2 |
+
from .datasets import * # noqa
|
| 3 |
+
from .pipelines import * # noqa
|
models/datasets/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (210 Bytes). View file
|
|
|
models/datasets/__pycache__/builder.cpython-38.pyc
ADDED
|
Binary file (1.92 kB). View file
|
|
|
models/datasets/builder.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mmcv.utils import build_from_cfg
|
| 2 |
+
from mmpose.datasets.builder import DATASETS
|
| 3 |
+
from mmpose.datasets.dataset_wrappers import RepeatDataset
|
| 4 |
+
from torch.utils.data.dataset import ConcatDataset
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _concat_cfg(cfg):
|
| 8 |
+
replace = ['ann_file', 'img_prefix']
|
| 9 |
+
channels = ['num_joints', 'dataset_channel']
|
| 10 |
+
concat_cfg = []
|
| 11 |
+
for i in range(len(cfg['type'])):
|
| 12 |
+
cfg_tmp = cfg.deepcopy()
|
| 13 |
+
cfg_tmp['type'] = cfg['type'][i]
|
| 14 |
+
for item in replace:
|
| 15 |
+
assert item in cfg_tmp
|
| 16 |
+
assert len(cfg['type']) == len(cfg[item]), (cfg[item])
|
| 17 |
+
cfg_tmp[item] = cfg[item][i]
|
| 18 |
+
for item in channels:
|
| 19 |
+
assert item in cfg_tmp['data_cfg']
|
| 20 |
+
assert len(cfg['type']) == len(cfg['data_cfg'][item])
|
| 21 |
+
cfg_tmp['data_cfg'][item] = cfg['data_cfg'][item][i]
|
| 22 |
+
concat_cfg.append(cfg_tmp)
|
| 23 |
+
return concat_cfg
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _check_vaild(cfg):
|
| 27 |
+
replace = ['num_joints', 'dataset_channel']
|
| 28 |
+
if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)):
|
| 29 |
+
for item in replace:
|
| 30 |
+
cfg['data_cfg'][item] = cfg['data_cfg'][item][0]
|
| 31 |
+
return cfg
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def build_dataset(cfg, default_args=None):
    """Build a dataset from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        default_args (dict, optional): Default initialization arguments.
            Default: None.

    Returns:
        Dataset: The constructed dataset.
    """
    cfg_type = cfg['type']
    if isinstance(cfg_type, (list, tuple)):  # In training, type=TransformerPoseDataset
        # Several dataset types: build each split config and concatenate.
        parts = [build_dataset(sub_cfg, default_args)
                 for sub_cfg in _concat_cfg(cfg)]
        return ConcatDataset(parts)
    if cfg_type == 'RepeatDataset':
        inner = build_dataset(cfg['dataset'], default_args)
        return RepeatDataset(inner, cfg['times'])
    # Single dataset: normalize list-valued fields, then build from registry.
    return build_from_cfg(_check_vaild(cfg), DATASETS, default_args)
|