matanru commited on
Commit
93b49a4
·
0 Parent(s):

initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. .idea/workspace.xml +60 -0
  3. LICENSE +203 -0
  4. README.md +14 -0
  5. app.py +442 -0
  6. configs/1shot-swin-clip/base_split1_config.py +195 -0
  7. configs/1shot-swin-clip/base_split2_config.py +195 -0
  8. configs/1shot-swin-clip/base_split3_config.py +195 -0
  9. configs/1shot-swin-clip/base_split4_config.py +195 -0
  10. configs/1shot-swin-clip/base_split5_config.py +195 -0
  11. configs/1shot-swin-clip/graph_split1_config.py +198 -0
  12. configs/1shot-swin-clip/graph_split2_config.py +197 -0
  13. configs/1shot-swin-clip/graph_split3_config.py +197 -0
  14. configs/1shot-swin-clip/graph_split4_config.py +197 -0
  15. configs/1shot-swin-clip/graph_split5_config.py +197 -0
  16. configs/1shot-swin-gte/base_split1_config.py +195 -0
  17. configs/1shot-swin-gte/base_split2_config.py +195 -0
  18. configs/1shot-swin-gte/base_split3_config.py +195 -0
  19. configs/1shot-swin-gte/base_split4_config.py +195 -0
  20. configs/1shot-swin-gte/base_split5_config.py +195 -0
  21. configs/1shot-swin-gte/graph_split1_config.py +199 -0
  22. configs/1shot-swin-gte/graph_split2_config.py +197 -0
  23. configs/1shot-swin-gte/graph_split3_config.py +197 -0
  24. configs/1shot-swin-gte/graph_split4_config.py +197 -0
  25. configs/1shot-swin-gte/graph_split5_config.py +197 -0
  26. configs/_base_/datasets/ap10k.py +142 -0
  27. configs/_base_/default_runtime.py +20 -0
  28. configs/demo_b.py +191 -0
  29. demo_text.py +212 -0
  30. docker/Dockerfile +59 -0
  31. environment.yml +201 -0
  32. examples/animal.png +0 -0
  33. examples/car.png +0 -0
  34. examples/chair.png +0 -0
  35. examples/person.png +0 -0
  36. models/VERSION +1 -0
  37. models/__init__.py +3 -0
  38. models/__pycache__/__init__.cpython-38.pyc +0 -0
  39. models/apis/__init__.py +5 -0
  40. models/apis/__pycache__/__init__.cpython-38.pyc +0 -0
  41. models/apis/__pycache__/train.cpython-38.pyc +0 -0
  42. models/apis/train.py +126 -0
  43. models/core/__init__.py +1 -0
  44. models/core/__pycache__/__init__.cpython-38.pyc +0 -0
  45. models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc +0 -0
  46. models/core/custom_hooks/shuffle_hooks.py +29 -0
  47. models/datasets/__init__.py +3 -0
  48. models/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
  49. models/datasets/__pycache__/builder.cpython-38.pyc +0 -0
  50. models/datasets/builder.py +54 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.idea/workspace.xml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="AutoImportSettings">
4
+ <option name="autoReloadType" value="SELECTIVE" />
5
+ </component>
6
+ <component name="ChangeListManager">
7
+ <list default="true" id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
8
+ <option name="SHOW_DIALOG" value="false" />
9
+ <option name="HIGHLIGHT_CONFLICTS" value="true" />
10
+ <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
11
+ <option name="LAST_RESOLUTION" value="IGNORE" />
12
+ </component>
13
+ <component name="Git.Settings">
14
+ <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
15
+ </component>
16
+ <component name="ProjectColorInfo"><![CDATA[{
17
+ "associatedIndex": 4
18
+ }]]></component>
19
+ <component name="ProjectId" id="2hKQwKx3zpbH4D8IcAn5ZJcn2HY" />
20
+ <component name="ProjectViewState">
21
+ <option name="hideEmptyMiddlePackages" value="true" />
22
+ <option name="showLibraryContents" value="true" />
23
+ </component>
24
+ <component name="PropertiesComponent"><![CDATA[{
25
+ "keyToString": {
26
+ "RunOnceActivity.ShowReadmeOnStart": "true",
27
+ "git-widget-placeholder": "main",
28
+ "last_opened_file_path": "/home/matanru/huggingface/CapeX",
29
+ "node.js.detected.package.eslint": "true",
30
+ "node.js.detected.package.tslint": "true",
31
+ "node.js.selected.package.eslint": "(autodetect)",
32
+ "node.js.selected.package.tslint": "(autodetect)",
33
+ "nodejs_package_manager_path": "npm",
34
+ "vue.rearranger.settings.migration": "true"
35
+ }
36
+ }]]></component>
37
+ <component name="SharedIndexes">
38
+ <attachedChunks>
39
+ <set>
40
+ <option value="bundled-js-predefined-1d06a55b98c1-cb551a44b0f8-JavaScript-PY-242.10180.30" />
41
+ <option value="bundled-python-sdk-7efad6460ed6-db4a76ca2eac-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-242.10180.30" />
42
+ </set>
43
+ </attachedChunks>
44
+ </component>
45
+ <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
46
+ <component name="TaskManager">
47
+ <task active="true" id="Default" summary="Default task">
48
+ <changelist id="1cdeceeb-6b27-4a56-9f44-8bb2d77c353f" name="Changes" comment="" />
49
+ <created>1717340527309</created>
50
+ <option name="number" value="Default" />
51
+ <option name="presentableId" value="Default" />
52
+ <updated>1717340527309</updated>
53
+ <workItem from="1717340528499" duration="17535000" />
54
+ </task>
55
+ <servers />
56
+ </component>
57
+ <component name="TypeScriptGeneratedFilesManager">
58
+ <option name="version" value="3" />
59
+ </component>
60
+ </project>
LICENSE ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 SenseTime. All Rights Reserved.
2
+
3
+ Apache License
4
+ Version 2.0, January 2004
5
+ http://www.apache.org/licenses/
6
+
7
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
8
+
9
+ 1. Definitions.
10
+
11
+ "License" shall mean the terms and conditions for use, reproduction,
12
+ and distribution as defined by Sections 1 through 9 of this document.
13
+
14
+ "Licensor" shall mean the copyright owner or entity authorized by
15
+ the copyright owner that is granting the License.
16
+
17
+ "Legal Entity" shall mean the union of the acting entity and all
18
+ other entities that control, are controlled by, or are under common
19
+ control with that entity. For the purposes of this definition,
20
+ "control" means (i) the power, direct or indirect, to cause the
21
+ direction or management of such entity, whether by contract or
22
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
23
+ outstanding shares, or (iii) beneficial ownership of such entity.
24
+
25
+ "You" (or "Your") shall mean an individual or Legal Entity
26
+ exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications,
29
+ including but not limited to software source code, documentation
30
+ source, and configuration files.
31
+
32
+ "Object" form shall mean any form resulting from mechanical
33
+ transformation or translation of a Source form, including but
34
+ not limited to compiled object code, generated documentation,
35
+ and conversions to other media types.
36
+
37
+ "Work" shall mean the work of authorship, whether in Source or
38
+ Object form, made available under the License, as indicated by a
39
+ copyright notice that is included in or attached to the work
40
+ (an example is provided in the Appendix below).
41
+
42
+ "Derivative Works" shall mean any work, whether in Source or Object
43
+ form, that is based on (or derived from) the Work and for which the
44
+ editorial revisions, annotations, elaborations, or other modifications
45
+ represent, as a whole, an original work of authorship. For the purposes
46
+ of this License, Derivative Works shall not include works that remain
47
+ separable from, or merely link (or bind by name) to the interfaces of,
48
+ the Work and Derivative Works thereof.
49
+
50
+ "Contribution" shall mean any work of authorship, including
51
+ the original version of the Work and any modifications or additions
52
+ to that Work or Derivative Works thereof, that is intentionally
53
+ submitted to Licensor for inclusion in the Work by the copyright owner
54
+ or by an individual or Legal Entity authorized to submit on behalf of
55
+ the copyright owner. For the purposes of this definition, "submitted"
56
+ means any form of electronic, verbal, or written communication sent
57
+ to the Licensor or its representatives, including but not limited to
58
+ communication on electronic mailing lists, source code control systems,
59
+ and issue tracking systems that are managed by, or on behalf of, the
60
+ Licensor for the purpose of discussing and improving the Work, but
61
+ excluding communication that is conspicuously marked or otherwise
62
+ designated in writing by the copyright owner as "Not a Contribution."
63
+
64
+ "Contributor" shall mean Licensor and any individual or Legal Entity
65
+ on behalf of whom a Contribution has been received by Licensor and
66
+ subsequently incorporated within the Work.
67
+
68
+ 2. Grant of Copyright License. Subject to the terms and conditions of
69
+ this License, each Contributor hereby grants to You a perpetual,
70
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
71
+ copyright license to reproduce, prepare Derivative Works of,
72
+ publicly display, publicly perform, sublicense, and distribute the
73
+ Work and such Derivative Works in Source or Object form.
74
+
75
+ 3. Grant of Patent License. Subject to the terms and conditions of
76
+ this License, each Contributor hereby grants to You a perpetual,
77
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
78
+ (except as stated in this section) patent license to make, have made,
79
+ use, offer to sell, sell, import, and otherwise transfer the Work,
80
+ where such license applies only to those patent claims licensable
81
+ by such Contributor that are necessarily infringed by their
82
+ Contribution(s) alone or by combination of their Contribution(s)
83
+ with the Work to which such Contribution(s) was submitted. If You
84
+ institute patent litigation against any entity (including a
85
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
86
+ or a Contribution incorporated within the Work constitutes direct
87
+ or contributory patent infringement, then any patent licenses
88
+ granted to You under this License for that Work shall terminate
89
+ as of the date such litigation is filed.
90
+
91
+ 4. Redistribution. You may reproduce and distribute copies of the
92
+ Work or Derivative Works thereof in any medium, with or without
93
+ modifications, and in Source or Object form, provided that You
94
+ meet the following conditions:
95
+
96
+ (a) You must give any other recipients of the Work or
97
+ Derivative Works a copy of this License; and
98
+
99
+ (b) You must cause any modified files to carry prominent notices
100
+ stating that You changed the files; and
101
+
102
+ (c) You must retain, in the Source form of any Derivative Works
103
+ that You distribute, all copyright, patent, trademark, and
104
+ attribution notices from the Source form of the Work,
105
+ excluding those notices that do not pertain to any part of
106
+ the Derivative Works; and
107
+
108
+ (d) If the Work includes a "NOTICE" text file as part of its
109
+ distribution, then any Derivative Works that You distribute must
110
+ include a readable copy of the attribution notices contained
111
+ within such NOTICE file, excluding those notices that do not
112
+ pertain to any part of the Derivative Works, in at least one
113
+ of the following places: within a NOTICE text file distributed
114
+ as part of the Derivative Works; within the Source form or
115
+ documentation, if provided along with the Derivative Works; or,
116
+ within a display generated by the Derivative Works, if and
117
+ wherever such third-party notices normally appear. The contents
118
+ of the NOTICE file are for informational purposes only and
119
+ do not modify the License. You may add Your own attribution
120
+ notices within Derivative Works that You distribute, alongside
121
+ or as an addendum to the NOTICE text from the Work, provided
122
+ that such additional attribution notices cannot be construed
123
+ as modifying the License.
124
+
125
+ You may add Your own copyright statement to Your modifications and
126
+ may provide additional or different license terms and conditions
127
+ for use, reproduction, or distribution of Your modifications, or
128
+ for any such Derivative Works as a whole, provided Your use,
129
+ reproduction, and distribution of the Work otherwise complies with
130
+ the conditions stated in this License.
131
+
132
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
133
+ any Contribution intentionally submitted for inclusion in the Work
134
+ by You to the Licensor shall be under the terms and conditions of
135
+ this License, without any additional terms or conditions.
136
+ Notwithstanding the above, nothing herein shall supersede or modify
137
+ the terms of any separate license agreement you may have executed
138
+ with Licensor regarding such Contributions.
139
+
140
+ 6. Trademarks. This License does not grant permission to use the trade
141
+ names, trademarks, service marks, or product names of the Licensor,
142
+ except as required for reasonable and customary use in describing the
143
+ origin of the Work and reproducing the content of the NOTICE file.
144
+
145
+ 7. Disclaimer of Warranty. Unless required by applicable law or
146
+ agreed to in writing, Licensor provides the Work (and each
147
+ Contributor provides its Contributions) on an "AS IS" BASIS,
148
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
149
+ implied, including, without limitation, any warranties or conditions
150
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
151
+ PARTICULAR PURPOSE. You are solely responsible for determining the
152
+ appropriateness of using or redistributing the Work and assume any
153
+ risks associated with Your exercise of permissions under this License.
154
+
155
+ 8. Limitation of Liability. In no event and under no legal theory,
156
+ whether in tort (including negligence), contract, or otherwise,
157
+ unless required by applicable law (such as deliberate and grossly
158
+ negligent acts) or agreed to in writing, shall any Contributor be
159
+ liable to You for damages, including any direct, indirect, special,
160
+ incidental, or consequential damages of any character arising as a
161
+ result of this License or out of the use or inability to use the
162
+ Work (including but not limited to damages for loss of goodwill,
163
+ work stoppage, computer failure or malfunction, or any and all
164
+ other commercial damages or losses), even if such Contributor
165
+ has been advised of the possibility of such damages.
166
+
167
+ 9. Accepting Warranty or Additional Liability. While redistributing
168
+ the Work or Derivative Works thereof, You may choose to offer,
169
+ and charge a fee for, acceptance of support, warranty, indemnity,
170
+ or other liability obligations and/or rights consistent with this
171
+ License. However, in accepting such obligations, You may act only
172
+ on Your own behalf and on Your sole responsibility, not on behalf
173
+ of any other Contributor, and only if You agree to indemnify,
174
+ defend, and hold each Contributor harmless for any liability
175
+ incurred by, or claims asserted against, such Contributor by reason
176
+ of your accepting any such warranty or additional liability.
177
+
178
+ END OF TERMS AND CONDITIONS
179
+
180
+ APPENDIX: How to apply the Apache License to your work.
181
+
182
+ To apply the Apache License to your work, attach the following
183
+ boilerplate notice, with the fields enclosed by brackets "[]"
184
+ replaced with your own identifying information. (Don't include
185
+ the brackets!) The text should be enclosed in the appropriate
186
+ comment syntax for the file format. We also recommend that a
187
+ file or class name and description of purpose be included on the
188
+ same "printed page" as the copyright notice for easier
189
+ identification within third-party archives.
190
+
191
+ Copyright 2020 MMClassification Authors.
192
+
193
+ Licensed under the Apache License, Version 2.0 (the "License");
194
+ you may not use this file except in compliance with the License.
195
+ You may obtain a copy of the License at
196
+
197
+ http://www.apache.org/licenses/LICENSE-2.0
198
+
199
+ Unless required by applicable law or agreed to in writing, software
200
+ distributed under the License is distributed on an "AS IS" BASIS,
201
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
202
+ See the License for the specific language governing permissions and
203
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: CapeX
3
+ emoji: 👁
4
+ colorFrom: indigo
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.36.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ python: 3.10.13
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import argparse
3
+ import random
4
+ import os
5
+
6
+ os.system('python setup.py develop')
7
+
8
+ import gradio as gr
9
+ import matplotlib
10
+ import numpy as np
11
+ import torch
12
+ from PIL import ImageDraw, Image
13
+ from matplotlib import pyplot as plt
14
+ from mmcv import Config
15
+ import json
16
+
17
+ # def replace_line(file_name, line_num, text):
18
+ # lines = open(file_name, 'r').readlines()
19
+ # lines[line_num] = text
20
+ # out = open(file_name, 'w')
21
+ # out.writelines(lines)
22
+ # out.close()
23
+
24
+ # def read_lines(file_name):
25
+ # lines = open(file_name, 'r').readlines()
26
+ # print(lines)
27
+
28
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 7, "from mmengine import print_log\n")
29
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/distributed.py", 8, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
30
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/parallel/registry.py", 3, 'from mmengine.registry import Registry\n')
31
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/fileio/io.py", 5, "from mmengine.utils import is_list_of\n")
32
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/checkpoint.py", 23, "from mmengine.utils import digit_version, mkdir_or_exist\nfrom mmengine.utils.dl_utils import load_url\n")
33
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/hook.py", 1, "from mmengine.registry import Registry\nfrom mmengine.utils import is_method_overridden\n")
34
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/evaluation.py",11, "from mmengine.utils import is_seq_of\n")
35
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/mlflow.py", 3, "from mmengine.utils.dl_utils import TORCH_VERSION\n")
36
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/tensorboard.py", 4, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
37
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/text.py", 12, "from mmengine.utils import is_tuple_of, scandir\n")
38
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/logger/wandb.py", 5, "from mmengine.utils import scandir\n")
39
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 11, "from mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm\n")
40
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/hooks/optimizer.py", 14, "from mmengine.utils import digit_version\n")
41
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/fp16_utils.py", 12, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.utils.dl_utils import TORCH_VERSION\nfrom mmengine.utils import digit_version\n")
42
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/builder.py", 4, "from mmengine.registry import Registry\n")
43
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/builder.py", 7, "from mmcv.utils import IS_NPU_AVAILABLE\nfrom mmengine.registry import Registry, build_from_cfg\n")
44
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/runner/optimizer/default_constructor.py", 8, "from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm\nfrom mmengine.registry import build_from_cfg\nfrom mmengine.utils import is_list_of\n")
45
+
46
+ # def is_ipu_available() -> bool:
47
+ # try:
48
+ # import poptorch
49
+ # return poptorch.ipuHardwareIsAvailable()
50
+ # except ImportError:
51
+ # return False
52
+
53
+ # IS_IPU_AVAILABLE = str(is_ipu_available())
54
+
55
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/ipu/__init__.py", 1, f'IS_IPU_AVAILABLE = {IS_IPU_AVAILABLE}\n')
56
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/scatter_gather.py", 4, "from mmengine.utils import deprecated_api_warning\n")
57
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmcv/device/_functions.py", 5, "from mmengine.utils import deprecated_api_warning\n")
58
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 1, "from mmengine.utils import digit_version\nfrom mmcv import parse_version_info\n")
59
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/__init__.py", 21, "import mmcv\nmmcv_version = digit_version(mmcv.__version__)\n")
60
+ # replace_line("/usr/local/lib/python3.10/site-packages/mmpose/core/optimizers/builder.py", 3, "from mmengine.registry import Registry, build_from_cfg")
61
+
62
+
63
+ from mmcv.runner import load_checkpoint
64
+ from mmpose.core import wrap_fp16_model
65
+ from mmpose.models import build_posenet
66
+ from torchvision import transforms
67
+
68
+ from demo_text import Resize_Pad
69
+ from models import *
70
+
71
+ import networkx as nx
72
+ import matplotlib.pyplot as plt
73
+ import ast
74
+ import cv2
75
+
76
+ import matplotlib
77
+ # matplotlib.use('agg')
78
+
79
+ def edges_prompt_to_list(prompt):
80
+ if prompt[0] != "[":
81
+ prompt = "[" + prompt
82
+ if prompt[-1] != "]":
83
+ prompt += "]"
84
+ return ast.literal_eval(prompt)
85
+
86
+ def descriptions_prompt_to_list(prompt):
87
+ return prompt.split(',')
88
+
89
+
90
+ # Function to visualize the graph
91
+ def visualize_graph(node_descriptions, edges):
92
+ plt.close('all')
93
+ node_descriptions = descriptions_prompt_to_list(node_descriptions)
94
+ edges = edges_prompt_to_list(edges)
95
+
96
+ # Create an empty graph
97
+ G = nx.Graph()
98
+ G.clear()
99
+
100
+ # Add nodes with descriptions
101
+ for i, desc in enumerate(node_descriptions):
102
+ G.add_node(i, label=desc)
103
+
104
+ # Add edges
105
+ for edge in edges:
106
+ G.add_edge(edge[0], edge[1])
107
+
108
+ # Draw the graph
109
+ pos = nx.spring_layout(G) # Define layout
110
+ labels = nx.get_node_attributes(G, 'label') # Get labels
111
+ nx.draw(G, pos, with_labels=True, labels=labels, node_size=1500, node_color='skyblue', font_size=10, font_weight='bold', font_color='black') # Draw nodes with labels
112
+ nx.draw_networkx_edges(G, pos, width=2, edge_color='gray') # Draw edges
113
+ plt.title("Graph Visualization") # Set title
114
+ plt.axis('off') # Turn off axis
115
+ # plt.show() # Show plot
116
+ # Image from plot
117
+ fig = plt.gcf()
118
+ # fig.tight_layout(pad=0)
119
+
120
+ # To remove the huge white borders
121
+ # plt.margins(0)
122
+
123
+ fig.canvas.draw()
124
+ image_from_plot = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
125
+ image_from_plot = image_from_plot.reshape(fig.canvas.get_width_height()[::-1] + (3,))
126
+ plt.clf()
127
+ return image_from_plot
128
+
129
+ checkpoint_path = ''
130
+
131
+
132
+
133
+ def plot_query_results(query_img, query_w, skeleton, prediction, radius=6):
134
+ h, w, c = query_img.shape
135
+ prediction = prediction[-1].cpu().numpy() * h
136
+ # prediction = prediction.cpu().numpy() * h
137
+ query_img = (query_img - np.min(query_img)) / (
138
+ np.max(query_img) - np.min(query_img))
139
+ for id, (img, w, keypoint) in enumerate(zip([query_img],
140
+ [query_w],
141
+ [prediction])):
142
+ f, axes = plt.subplots()
143
+ plt.imshow(img)
144
+ for k in range(keypoint.shape[0]):
145
+ if w[k] > 0:
146
+ kp = keypoint[k, :2]
147
+ c = (1, 0, 0, 0.75) if w[k] == 1 else (0, 0, 1, 0.6)
148
+ patch = plt.Circle(kp, radius, color=c)
149
+ axes.add_patch(patch)
150
+ axes.text(kp[0], kp[1], k)
151
+ plt.draw()
152
+ for l, limb in enumerate(skeleton):
153
+ kp = keypoint[:, :2]
154
+ if l > len(COLORS) - 1:
155
+ c = [x / 255 for x in random.sample(range(0, 255), 3)]
156
+ else:
157
+ c = [x / 255 for x in COLORS[l]]
158
+ if w[limb[0]] > 0 and w[limb[1]] > 0:
159
+ patch = plt.Line2D([kp[limb[0], 0], kp[limb[1], 0]],
160
+ [kp[limb[0], 1], kp[limb[1], 1]],
161
+ linewidth=6, color=c, alpha=0.6)
162
+ axes.add_artist(patch)
163
+ plt.axis('off') # command for hiding the axis.
164
+ plt.subplots_adjust(0, 0, 1, 1, 0, 0)
165
+ plt.margins(0)
166
+ fig = plt.gcf()
167
+ fig.tight_layout(pad=0)
168
+
169
+ return plt
170
+
171
+ COLORS = [
172
+ [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
173
+ [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
174
+ [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
175
+ [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]
176
+ ]
177
+
178
+ model = None
179
+
180
+ # @spaces.GPU(duration=30)
181
+ # def estimate(model, data):
182
+ # with torch.no_grad():
183
+ # model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
184
+ # data["img_q"].to(device=model_device)
185
+ # data['target_weight_s'][0].to(device=model_device)
186
+ # print(f'img type: {data["img_q"].dtype}, target_weight type: {data["target_weight_s"][0].dtype}')
187
+ # model.to(model_device)
188
+ # model.eval()
189
+ # # return model(**data)
190
+ # return model(str(data))
191
+
192
+ # @spaces.GPU(duration=30)
193
+ def estimate(data):
194
+ global model
195
+ with torch.no_grad():
196
+ # model_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
197
+ # data["img_q"].to(device=model_device)
198
+ # data['target_weight_s'][0].to(device=model_device)
199
+
200
+ return model(data)
201
+
202
+
203
+ # Custom JSON encoder to handle non-serializable objects
204
+ class CustomEncoder(json.JSONEncoder):
205
+ def default(self, obj):
206
+ if isinstance(obj, np.ndarray):
207
+ return obj.tolist()
208
+ return super().default(obj)
209
+
210
+
211
def process(query_img, node_descriptions, edges,
            cfg_path='configs/1shot-swin-gte/graph_split1_config.py'):
    """Pose the query image from a text-graph prompt (CapeX inference).

    Parameters
    ----------
    query_img :
        Image from the Gradio UI (presumably a PIL image — it is passed
        through ``np.array``; confirm against the ``gr.Image`` component).
    node_descriptions : str
        Comma-separated keypoint descriptions; parsed with
        ``descriptions_prompt_to_list``.
    edges : str
        Textual list of 2-index pairs; parsed with ``edges_prompt_to_list``.
    cfg_path : str
        mmcv-style config file describing the model and preprocessing.

    Returns
    -------
    The figure produced by ``plot_query_results`` for the predicted
    keypoints.
    """
    # NOTE(review): the model is (re)built on every call; `global` only
    # keeps the last instance alive — confirm whether caching was intended.
    global model
    node_descriptions = descriptions_prompt_to_list(node_descriptions)
    edges = edges_prompt_to_list(edges)
    cfg = Config.fromfile(cfg_path)
    # Support keypoint coordinates are all-zero placeholders; presumably
    # only the text descriptions carry information in this text-graph mode.
    kp_src_tensor = torch.zeros((len(node_descriptions), 2))
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size,
                   cfg.model.encoder_config.img_size)])

    # The downstream pipeline expects at least one edge; use a self-loop.
    if len(edges) == 0:
        edges = [(0, 0)]

    #model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # RGB -> BGR via ::-1, then flip(0) after ToTensor reverses channels
    # again — net effect should be RGB order; TODO confirm.
    np_query = np.array(query_img)[:, :, ::-1].copy()
    q_img = preprocess(np_query).flip(0)[None] #.to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size,
                                       cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Append a zero z-column: (N, 2) keypoints -> (N, 3) "3d" joints.
    kp_src_3d = torch.cat(
        (kp_src_tensor, torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.cat(
        (torch.ones_like(kp_src_tensor),
         torch.zeros(kp_src_tensor.shape[0], 1)), dim=-1)
    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg,
                                                                 kp_src_3d,
                                                                 kp_src_3d_weight,
                                                                 sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    # All support targets are marked visible regardless of the generator's
    # output weights.
    target_weight_s = torch.ones_like(
        torch.tensor(target_weight_s).float()[None]) #.to(model_device)

    # Mimic the dataloader's batch structure for a single query with one
    # (dummy) support sample.
    data = {
        'img_s': [0],
        'img_q': q_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [edges],
                       'query_skeleton': edges,
                       # 'sample_point_descriptions': np.array([node_descriptions]),
                       'sample_point_descriptions': node_descriptions,
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src_tensor.mean(dim=0)],
                       'query_center': kp_src_tensor.mean(dim=0),
                       'sample_scale': [
                           kp_src_tensor.max(dim=0)[0] -
                           kp_src_tensor.min(dim=0)[0]],
                       'query_scale': kp_src_tensor.max(dim=0)[0] -
                                      kp_src_tensor.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }
    # Load model
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    # `checkpoint_path` is a module-level global set in the __main__ block.
    load_checkpoint(model, checkpoint_path, map_location='cpu')
    #model.to(model_device)
    #model.eval()

    # with torch.no_grad():
    #     outputs = model(**data)

    # Convert tensors to plain lists so the payload can be JSON-serialized
    # for the `estimate` call below.
    data["img_q"] = data["img_q"].cpu().numpy().tolist()
    data['target_weight_s'][0] = data['target_weight_s'][0].cpu().numpy().tolist()
    data['target_s'][0] = data['target_s'][0].cpu().numpy().tolist()

    data['img_metas'][0]['sample_joints_3d'][0] = data['img_metas'][0]['sample_joints_3d'][0].cpu().tolist()
    data['img_metas'][0]['query_joints_3d'] = data['img_metas'][0]['query_joints_3d'].cpu().tolist()
    data['img_metas'][0]['sample_center'][0] = data['img_metas'][0]['sample_center'][0].cpu().tolist()
    data['img_metas'][0]['query_center'] = data['img_metas'][0]['query_center'].cpu().tolist()
    data['img_metas'][0]['sample_scale'][0] = data['img_metas'][0]['sample_scale'][0].cpu().tolist()
    data['img_metas'][0]['query_scale'] = data['img_metas'][0]['query_scale'].cpu().tolist()

    # # data['img_metas'][0]['sample_point_descriptions'] = data['img_metas'][0]['sample_point_descriptions'].tolist()


    #model.cuda()
    model.eval()
    # return model(**data)
    # with torch.no_grad():
    #     outputs = model(**data)
    str_data = json.dumps(data, cls=CustomEncoder)

    # `estimate` runs the actual inference on the JSON payload; its result
    # is expected to contain a 'points' entry (see below).
    outputs = estimate(str_data)
    #outputs = estimate(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    vis_q_image = q_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    out = plot_query_results(vis_q_image, vis_q_weight, edges, torch.tensor(outputs['points']).squeeze(0))
    return out
322
+
323
+
324
def update_examples(query_img, node_descriptions, edges):
    """Identity handler for ``gr.Examples``: echo the clicked example's
    values back so they populate the corresponding input components."""
    selection = (query_img, node_descriptions, edges)
    return selection
327
+
328
+
329
# Gradio UI: text-graph inputs (keypoint descriptions + edges), a live
# graph visualization, a query image, and an Evaluate button that runs
# `process`.
with gr.Blocks() as demo:
    # Per-session UI state.
    # NOTE(review): `state` is not referenced by any event handler below —
    # confirm it is still needed before removing.
    state = gr.State({
        'kp_src': [],
        'skeleton': [],
        'count': 0,
        'color_idx': 0,
        'prev_pt': None,
        'prev_pt_idx': None,
        'prev_clicked': None,
        'point_descriptions': None,
    })
    gr.Markdown('''
    # CapeX Demo
    We present a novel category agnostic pose estimation approach that utilizes support text-graphs
    (graphs with text on its nodes), instead of the conventional techniques that use support images.
    By leveraging the abstraction power of text-graphs, CapeX showcases SOTA results on MP100 while dropping the need
    of providing an annotated support image.
    ### [Paper](https://arxiv.org/pdf/2406.00384) | [GitHub](https://github.com/matanr/capex)
    ## Instructions
    1. Explain using text the desired keypoints. Please refer to the example for the right format.
    2. Optionally provide a graph representing the connections between the keypoints. Please refer to the example for the right format.
    3. Upload an image of the object you want to pose to the query image.
    4. Click **Evaluate** to pose the query image.
    ''')
    with gr.Row():
        # Input block for node descriptions
        node_descriptions = gr.Textbox(label="Node Descriptions (String separated by commas)", lines=5, type="text",
                                       value="left eye, nose, right eye"
                                       )

        # Input block for edges
        edges = gr.Textbox(label="Edges (List of 2-valued lists representing connections)", lines=5, type="text",
                           value="[[0,1], [1,2]]"
                           )

        def set_initial_text_graph():
            # Initial preview matching the default textbox values above.
            text_graph = visualize_graph("left eye, nose, right eye", "[[0,1], [1,2]]")
            return text_graph

        text_graph = gr.Image(label="Text-graph visualization",
                              value=set_initial_text_graph,
                              type="pil", height=400, width=400)

    with gr.Row():
        query_img = gr.Image(label="Query Image",
                             type="pil", height=400, width=400)
    with gr.Row():
        eval_btn = gr.Button(value="Evaluate")
    with gr.Row():
        output_img = gr.Plot(label="Output Image")
    with gr.Row():
        gr.Markdown("## Examples")
    with gr.Row():
        # Clicking an example fills the three inputs via `update_examples`.
        gr.Examples(
            examples=[
                ['examples/animal.png',
                 "left eye, right eye, nose, neck, root of tail, left shoulder, left elbow, "
                 "left front paw, right shoulder, right elbow, right front paw, left hip, "
                 "left knee, left back paw, right hip, right knee, right back paw",
                 "[[0, 1], [0, 2], [1, 2], [2, 3], [3, 4], [3, 5], [5, 6], [6, 7], [3, 8], [8, 9],"
                 "[9, 10], [4, 11], [11, 12], [12, 13], [4, 14], [14, 15], [15, 16]]"
                 ],
                ['examples/person.png',
                 "nose, left eye, right eye, left ear, right ear, left shoulder, right shoulder, left elbow, "
                 "right elbow, left wrist, right wrist, left hip, right hip, left knee, right knee, left ankle, "
                 "right ankle",
                 "[[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7],"
                 "[6, 8], [7, 9], [8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]"
                 ],
                ['examples/chair.png',
                 "left and front leg, right and front leg, right and back leg, left and back leg, "
                 "left and front side of the seat, right and front side of the seat, right and back side of the seat, "
                 "left and back side of the seat, top left side of the backseat, top right side of the backseat",
                 "[[0, 4], [3, 7], [1, 5], [2, 6], [4, 5], [5, 6], [6, 7], [7, 4], [6, 7], [7, 8],[8, 9], [9, 6]]",
                 ],
                ['examples/car.png',
                 "front and right wheel, front and left wheel, rear and right wheel, rear and left wheel, "
                 "right headlight, left headlight, right taillight, left taillight, "
                 "front and right side of the top, front and left side of the top, rear and right side of the top, "
                 "rear and left side of the top",
                 "[[0, 2], [1, 3], [0, 1], [2, 3], [8, 10], [9, 11], [8, 9], [10, 11], [4, 0], "
                 "[4, 8], [4, 5], [5, 1], [5, 9], [6, 2], [6, 10], [7, 3], [7, 11], [6, 7]]"
                 ]
            ],
            inputs=[query_img, node_descriptions, edges],
            outputs=[query_img, node_descriptions, edges],
            fn=update_examples,
            run_on_click=True,
        )

    eval_btn.click(fn=process,
                   inputs=[query_img, node_descriptions, edges],
                   outputs=[output_img])

    # Keep the graph preview in sync with either textbox.
    node_descriptions.change(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])
    edges.input(visualize_graph, inputs=[node_descriptions, edges], outputs=[text_graph])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='CapeX Demo')
    parser.add_argument('--checkpoint',
                        help='checkpoint path',
                        default='swin-gte-split1.pth')
    args = parser.parse_args()
    # Module-level global consumed by `process` when loading the model.
    checkpoint_path = args.checkpoint
    demo.launch()
configs/1shot-swin-clip/base_split1_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 1.

SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Plain assignments only; loaded with
mmcv-style ``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split1_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split2_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 2.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split2_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split3_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 3.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split3_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split4_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""1-shot CapeX base config for MP100 split 4.

Identical to the split-1 config except for the annotation files.
SwinV2 image encoder with a "ViT-B/32" text encoder (CLIP-style naming —
confirm against the model code).  Loaded with mmcv-style
``Config.fromfile``.
"""
log_level = 'INFO'
load_from = None      # optional weights to initialise from
resume_from = None    # optional checkpoint to resume training from
dist_params = dict(backend='nccl')
workflow = [('train', 1)]
checkpoint_config = dict(interval=20)  # save interval (presumably epochs)
# Periodic validation; PCK is the indicator used to pick the best model.
evaluation = dict(
    interval=25,
    metric=['PCK', 'NME', 'AUC', 'EPE'],
    key_indicator='PCK',
    gpu_collect=True,
    res_folder='')
optimizer = dict(
    type='Adam',
    lr=1e-5,
)

optimizer_config = dict(grad_clip=None)
# learning policy: linear warmup then step decay at epochs 160 and 180
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=1000,
    warmup_ratio=0.001,
    step=[160, 180])
total_epochs = 200
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        dict(type='TensorboardLoggerHook')
    ])

# Category-agnostic setup: a single output channel/joint, with up to
# `max_kpt_num` keypoints per category.
channel_cfg = dict(
    num_output_channels=1,
    dataset_joints=1,
    dataset_channel=[
        [
            0,
        ],
    ],
    inference_channel=[
        0,
    ],
    max_kpt_num=100)

# model settings
model = dict(
    type='PoseAnythingModel',
    pretrained='pretrained/swinv2_small_1k_500k.pth',
    text_pretrained="ViT-B/32",
    finetune_text_pretrained=False,  # presumably keeps the text encoder frozen
    encoder_config=dict(
        type='SwinTransformerV2',
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=16,
        drop_path_rate=0.3,
        img_size=256,
        upsample="bilinear"
    ),
    keypoint_head=dict(
        type='PoseHead',
        img_in_channels=768,   # 96 * 2**3: SwinV2 final stage feature dim
        text_in_channels=512,
        transformer=dict(
            type='EncoderDecoder',
            d_model=256,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=3,
            dim_feedforward=768,
            dropout=0.1,
            similarity_proj_dim=256,
            dynamic_proj_dim=128,
            activation="relu",
            normalize_before=False,
            return_intermediate_dec=True),
        share_kpt_branch=False,
        num_decoder_layer=3,
        with_heatmap_loss=True,

        heatmap_loss_weight=2.0,
        support_order_dropout=-1,
        positional_encoding=dict(
            type='SinePositionalEncoding', num_feats=128, normalize=True)),
    # training and testing settings
    train_cfg=dict(),
    test_cfg=dict(
        flip_test=False,
        post_process='default',
        shift_heatmap=True,
        modulate_kernel=11))

data_cfg = dict(
    image_size=[256, 256],
    heatmap_size=[64, 64],
    num_output_channels=channel_cfg['num_output_channels'],
    num_joints=channel_cfg['dataset_joints'],
    dataset_channel=channel_cfg['dataset_channel'],
    inference_channel=channel_cfg['inference_channel'])

# Training pipeline: random scale/rotation, affine crop, normalisation,
# Gaussian-heatmap target generation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='TopDownGetRandomScaleRotation', rot_factor=15,
        scale_factor=0.15),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
            'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
        ]),
]

# Validation/test pipeline: as above but without random augmentation.
valid_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='TopDownAffineFewShot'),
    dict(type='ToTensor'),
    dict(
        type='NormalizeTensor',
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]),
    dict(type='TopDownGenerateTargetFewShot', sigma=1),
    dict(
        type='Collect',
        keys=['img', 'target', 'target_weight'],
        meta_keys=[
            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
            'flip_pairs', 'category_id',
            'skeleton',
        ]),
]

test_pipeline = valid_pipeline

data_root = 'data/mp100'
data = dict(
    samples_per_gpu=16,
    workers_per_gpu=16,
    # samples_per_gpu=8,
    # workers_per_gpu=8,
    train=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_train.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        pipeline=train_pipeline),
    val=dict(
        type='TransformerPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_val.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=100,
        pipeline=valid_pipeline),
    test=dict(
        type='TestPoseDataset',
        ann_file=f'{data_root}/annotations/mp100_split4_test.json',
        img_prefix=f'{data_root}/images/',
        # img_prefix=f'{data_root}',
        data_cfg=data_cfg,
        valid_class_ids=None,
        max_kpt_num=channel_cfg['max_kpt_num'],
        num_shots=1,
        num_queries=15,
        num_episodes=200,
        pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
        pipeline=test_pipeline),
)
vis_backends = [
    dict(type='LocalVisBackend'),
    dict(type='TensorboardVisBackend'),
]
visualizer = dict(
    type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')

shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/base_split5_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=512,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split1_config.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ # total_epochs = 1
28
+ log_config = dict(
29
+ interval=50,
30
+ hooks=[
31
+ dict(type='TextLoggerHook'),
32
+ dict(type='TensorboardLoggerHook')
33
+ ])
34
+
35
+ channel_cfg = dict(
36
+ num_output_channels=1,
37
+ dataset_joints=1,
38
+ dataset_channel=[
39
+ [
40
+ 0,
41
+ ],
42
+ ],
43
+ inference_channel=[
44
+ 0,
45
+ ],
46
+ max_kpt_num=100)
47
+
48
+ # model settings
49
+ model = dict(
50
+ type='PoseAnythingModel',
51
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
52
+ text_pretrained="ViT-B/32",
53
+ finetune_text_pretrained=False,
54
+ encoder_config=dict(
55
+ type='SwinTransformerV2',
56
+ embed_dim=96,
57
+ depths=[2, 2, 18, 2],
58
+ num_heads=[3, 6, 12, 24],
59
+ window_size=16,
60
+ drop_path_rate=0.3,
61
+ img_size=256,
62
+ upsample="bilinear"
63
+ ),
64
+ keypoint_head=dict(
65
+ type='PoseHead',
66
+ img_in_channels=768,
67
+ # text_in_channels=768,
68
+ text_in_channels=512,
69
+ transformer=dict(
70
+ type='EncoderDecoder',
71
+ d_model=256,
72
+ nhead=8,
73
+ num_encoder_layers=3,
74
+ num_decoder_layers=3,
75
+ graph_decoder='pre',
76
+ dim_feedforward=768,
77
+ dropout=0.1,
78
+ similarity_proj_dim=256,
79
+ dynamic_proj_dim=128,
80
+ activation="relu",
81
+ normalize_before=False,
82
+ return_intermediate_dec=True),
83
+ share_kpt_branch=False,
84
+ num_decoder_layer=3,
85
+ with_heatmap_loss=True,
86
+
87
+ heatmap_loss_weight=2.0,
88
+ support_order_dropout=-1,
89
+ positional_encoding=dict(
90
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
91
+ # training and testing settings
92
+ train_cfg=dict(),
93
+ test_cfg=dict(
94
+ flip_test=False,
95
+ post_process='default',
96
+ shift_heatmap=True,
97
+ modulate_kernel=11))
98
+
99
+ data_cfg = dict(
100
+ image_size=[256, 256],
101
+ heatmap_size=[64, 64],
102
+ num_output_channels=channel_cfg['num_output_channels'],
103
+ num_joints=channel_cfg['dataset_joints'],
104
+ dataset_channel=channel_cfg['dataset_channel'],
105
+ inference_channel=channel_cfg['inference_channel'])
106
+
107
+ train_pipeline = [
108
+ dict(type='LoadImageFromFile'),
109
+ dict(
110
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
111
+ scale_factor=0.15),
112
+ dict(type='TopDownAffineFewShot'),
113
+ dict(type='ToTensor'),
114
+ dict(
115
+ type='NormalizeTensor',
116
+ mean=[0.485, 0.456, 0.406],
117
+ std=[0.229, 0.224, 0.225]),
118
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
119
+ dict(
120
+ type='Collect',
121
+ keys=['img', 'target', 'target_weight'],
122
+ meta_keys=[
123
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
124
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
125
+ ]),
126
+ ]
127
+
128
+ valid_pipeline = [
129
+ dict(type='LoadImageFromFile'),
130
+ dict(type='TopDownAffineFewShot'),
131
+ dict(type='ToTensor'),
132
+ dict(
133
+ type='NormalizeTensor',
134
+ mean=[0.485, 0.456, 0.406],
135
+ std=[0.229, 0.224, 0.225]),
136
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
137
+ dict(
138
+ type='Collect',
139
+ keys=['img', 'target', 'target_weight'],
140
+ meta_keys=[
141
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
142
+ 'flip_pairs', 'category_id',
143
+ 'skeleton',
144
+ ]),
145
+ ]
146
+
147
+ test_pipeline = valid_pipeline
148
+
149
+ data_root = 'data/mp100'
150
+ data = dict(
151
+ samples_per_gpu=16,
152
+ workers_per_gpu=16,
153
+ # samples_per_gpu=8,
154
+ # workers_per_gpu=8,
155
+ train=dict(
156
+ type='TransformerPoseDataset',
157
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
158
+ img_prefix=f'{data_root}/images/',
159
+ # img_prefix=f'{data_root}',
160
+ data_cfg=data_cfg,
161
+ valid_class_ids=None,
162
+ max_kpt_num=channel_cfg['max_kpt_num'],
163
+ num_shots=1,
164
+ pipeline=train_pipeline),
165
+ val=dict(
166
+ type='TransformerPoseDataset',
167
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
168
+ img_prefix=f'{data_root}/images/',
169
+ # img_prefix=f'{data_root}',
170
+ data_cfg=data_cfg,
171
+ valid_class_ids=None,
172
+ max_kpt_num=channel_cfg['max_kpt_num'],
173
+ num_shots=1,
174
+ num_queries=15,
175
+ num_episodes=100,
176
+ pipeline=valid_pipeline),
177
+ test=dict(
178
+ type='TestPoseDataset',
179
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
180
+ img_prefix=f'{data_root}/images/',
181
+ # img_prefix=f'{data_root}',
182
+ data_cfg=data_cfg,
183
+ valid_class_ids=None,
184
+ max_kpt_num=channel_cfg['max_kpt_num'],
185
+ num_shots=1,
186
+ num_queries=15,
187
+ num_episodes=200,
188
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
189
+ pipeline=test_pipeline),
190
+ )
191
+ vis_backends = [
192
+ dict(type='LocalVisBackend'),
193
+ dict(type='TensorboardVisBackend'),
194
+ ]
195
+ visualizer = dict(
196
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
197
+
198
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split2_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split3_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split4_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-clip/graph_split5_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained="ViT-B/32",
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ # text_in_channels=768,
67
+ text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split1_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split2_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split3_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split4_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/base_split5_config.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ transformer=dict(
68
+ type='EncoderDecoder',
69
+ d_model=256,
70
+ nhead=8,
71
+ num_encoder_layers=3,
72
+ num_decoder_layers=3,
73
+ dim_feedforward=768,
74
+ dropout=0.1,
75
+ similarity_proj_dim=256,
76
+ dynamic_proj_dim=128,
77
+ activation="relu",
78
+ normalize_before=False,
79
+ return_intermediate_dec=True),
80
+ share_kpt_branch=False,
81
+ num_decoder_layer=3,
82
+ with_heatmap_loss=True,
83
+
84
+ heatmap_loss_weight=2.0,
85
+ support_order_dropout=-1,
86
+ positional_encoding=dict(
87
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
88
+ # training and testing settings
89
+ train_cfg=dict(),
90
+ test_cfg=dict(
91
+ flip_test=False,
92
+ post_process='default',
93
+ shift_heatmap=True,
94
+ modulate_kernel=11))
95
+
96
+ data_cfg = dict(
97
+ image_size=[256, 256],
98
+ heatmap_size=[64, 64],
99
+ num_output_channels=channel_cfg['num_output_channels'],
100
+ num_joints=channel_cfg['dataset_joints'],
101
+ dataset_channel=channel_cfg['dataset_channel'],
102
+ inference_channel=channel_cfg['inference_channel'])
103
+
104
+ train_pipeline = [
105
+ dict(type='LoadImageFromFile'),
106
+ dict(
107
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
108
+ scale_factor=0.15),
109
+ dict(type='TopDownAffineFewShot'),
110
+ dict(type='ToTensor'),
111
+ dict(
112
+ type='NormalizeTensor',
113
+ mean=[0.485, 0.456, 0.406],
114
+ std=[0.229, 0.224, 0.225]),
115
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
116
+ dict(
117
+ type='Collect',
118
+ keys=['img', 'target', 'target_weight'],
119
+ meta_keys=[
120
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
121
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
122
+ ]),
123
+ ]
124
+
125
+ valid_pipeline = [
126
+ dict(type='LoadImageFromFile'),
127
+ dict(type='TopDownAffineFewShot'),
128
+ dict(type='ToTensor'),
129
+ dict(
130
+ type='NormalizeTensor',
131
+ mean=[0.485, 0.456, 0.406],
132
+ std=[0.229, 0.224, 0.225]),
133
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
134
+ dict(
135
+ type='Collect',
136
+ keys=['img', 'target', 'target_weight'],
137
+ meta_keys=[
138
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
139
+ 'flip_pairs', 'category_id',
140
+ 'skeleton',
141
+ ]),
142
+ ]
143
+
144
+ test_pipeline = valid_pipeline
145
+
146
+ data_root = 'data/mp100'
147
+ data = dict(
148
+ samples_per_gpu=16,
149
+ workers_per_gpu=16,
150
+ # samples_per_gpu=8,
151
+ # workers_per_gpu=8,
152
+ train=dict(
153
+ type='TransformerPoseDataset',
154
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
155
+ img_prefix=f'{data_root}/images/',
156
+ # img_prefix=f'{data_root}',
157
+ data_cfg=data_cfg,
158
+ valid_class_ids=None,
159
+ max_kpt_num=channel_cfg['max_kpt_num'],
160
+ num_shots=1,
161
+ pipeline=train_pipeline),
162
+ val=dict(
163
+ type='TransformerPoseDataset',
164
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
165
+ img_prefix=f'{data_root}/images/',
166
+ # img_prefix=f'{data_root}',
167
+ data_cfg=data_cfg,
168
+ valid_class_ids=None,
169
+ max_kpt_num=channel_cfg['max_kpt_num'],
170
+ num_shots=1,
171
+ num_queries=15,
172
+ num_episodes=100,
173
+ pipeline=valid_pipeline),
174
+ test=dict(
175
+ type='TestPoseDataset',
176
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
177
+ img_prefix=f'{data_root}/images/',
178
+ # img_prefix=f'{data_root}',
179
+ data_cfg=data_cfg,
180
+ valid_class_ids=None,
181
+ max_kpt_num=channel_cfg['max_kpt_num'],
182
+ num_shots=1,
183
+ num_queries=15,
184
+ num_episodes=200,
185
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
186
+ pipeline=test_pipeline),
187
+ )
188
+ vis_backends = [
189
+ dict(type='LocalVisBackend'),
190
+ dict(type='TensorboardVisBackend'),
191
+ ]
192
+ visualizer = dict(
193
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
194
+
195
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split1_config.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ # total_epochs = 1
28
+ log_config = dict(
29
+ interval=50,
30
+ hooks=[
31
+ dict(type='TextLoggerHook'),
32
+ dict(type='TensorboardLoggerHook')
33
+ ])
34
+
35
+ channel_cfg = dict(
36
+ num_output_channels=1,
37
+ dataset_joints=1,
38
+ dataset_channel=[
39
+ [
40
+ 0,
41
+ ],
42
+ ],
43
+ inference_channel=[
44
+ 0,
45
+ ],
46
+ max_kpt_num=100)
47
+
48
+ # model settings
49
+ model = dict(
50
+ type='PoseAnythingModel',
51
+ pretrained="swinv2_base",
52
+ #'pretrained/swinv2_small_1k_500k.pth',
53
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
54
+ finetune_text_pretrained=False,
55
+ encoder_config=dict(
56
+ type='SwinTransformerV2',
57
+ embed_dim=96,
58
+ depths=[2, 2, 18, 2],
59
+ num_heads=[3, 6, 12, 24],
60
+ window_size=16,
61
+ drop_path_rate=0.3,
62
+ img_size=256,
63
+ upsample="bilinear"
64
+ ),
65
+ keypoint_head=dict(
66
+ type='PoseHead',
67
+ img_in_channels=768,
68
+ text_in_channels=768,
69
+ # text_in_channels=512,
70
+ transformer=dict(
71
+ type='EncoderDecoder',
72
+ d_model=256,
73
+ nhead=8,
74
+ num_encoder_layers=3,
75
+ num_decoder_layers=3,
76
+ graph_decoder='pre',
77
+ dim_feedforward=768,
78
+ dropout=0.1,
79
+ similarity_proj_dim=256,
80
+ dynamic_proj_dim=128,
81
+ activation="relu",
82
+ normalize_before=False,
83
+ return_intermediate_dec=True),
84
+ share_kpt_branch=False,
85
+ num_decoder_layer=3,
86
+ with_heatmap_loss=True,
87
+
88
+ heatmap_loss_weight=2.0,
89
+ support_order_dropout=-1,
90
+ positional_encoding=dict(
91
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
92
+ # training and testing settings
93
+ train_cfg=dict(),
94
+ test_cfg=dict(
95
+ flip_test=False,
96
+ post_process='default',
97
+ shift_heatmap=True,
98
+ modulate_kernel=11))
99
+
100
+ data_cfg = dict(
101
+ image_size=[256, 256],
102
+ heatmap_size=[64, 64],
103
+ num_output_channels=channel_cfg['num_output_channels'],
104
+ num_joints=channel_cfg['dataset_joints'],
105
+ dataset_channel=channel_cfg['dataset_channel'],
106
+ inference_channel=channel_cfg['inference_channel'])
107
+
108
+ train_pipeline = [
109
+ dict(type='LoadImageFromFile'),
110
+ dict(
111
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
112
+ scale_factor=0.15),
113
+ dict(type='TopDownAffineFewShot'),
114
+ dict(type='ToTensor'),
115
+ dict(
116
+ type='NormalizeTensor',
117
+ mean=[0.485, 0.456, 0.406],
118
+ std=[0.229, 0.224, 0.225]),
119
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
120
+ dict(
121
+ type='Collect',
122
+ keys=['img', 'target', 'target_weight'],
123
+ meta_keys=[
124
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
125
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
126
+ ]),
127
+ ]
128
+
129
+ valid_pipeline = [
130
+ dict(type='LoadImageFromFile'),
131
+ dict(type='TopDownAffineFewShot'),
132
+ dict(type='ToTensor'),
133
+ dict(
134
+ type='NormalizeTensor',
135
+ mean=[0.485, 0.456, 0.406],
136
+ std=[0.229, 0.224, 0.225]),
137
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
138
+ dict(
139
+ type='Collect',
140
+ keys=['img', 'target', 'target_weight'],
141
+ meta_keys=[
142
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
143
+ 'flip_pairs', 'category_id',
144
+ 'skeleton',
145
+ ]),
146
+ ]
147
+
148
+ test_pipeline = valid_pipeline
149
+
150
+ data_root = 'data/mp100'
151
+ data = dict(
152
+ samples_per_gpu=16,
153
+ workers_per_gpu=16,
154
+ # samples_per_gpu=8,
155
+ # workers_per_gpu=8,
156
+ train=dict(
157
+ type='TransformerPoseDataset',
158
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
159
+ img_prefix=f'{data_root}/images/',
160
+ # img_prefix=f'{data_root}',
161
+ data_cfg=data_cfg,
162
+ valid_class_ids=None,
163
+ max_kpt_num=channel_cfg['max_kpt_num'],
164
+ num_shots=1,
165
+ pipeline=train_pipeline),
166
+ val=dict(
167
+ type='TransformerPoseDataset',
168
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
169
+ img_prefix=f'{data_root}/images/',
170
+ # img_prefix=f'{data_root}',
171
+ data_cfg=data_cfg,
172
+ valid_class_ids=None,
173
+ max_kpt_num=channel_cfg['max_kpt_num'],
174
+ num_shots=1,
175
+ num_queries=15,
176
+ num_episodes=100,
177
+ pipeline=valid_pipeline),
178
+ test=dict(
179
+ type='TestPoseDataset',
180
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
181
+ img_prefix=f'{data_root}/images/',
182
+ # img_prefix=f'{data_root}',
183
+ data_cfg=data_cfg,
184
+ valid_class_ids=None,
185
+ max_kpt_num=channel_cfg['max_kpt_num'],
186
+ num_shots=1,
187
+ num_queries=15,
188
+ num_episodes=200,
189
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
190
+ pipeline=test_pipeline),
191
+ )
192
+ vis_backends = [
193
+ dict(type='LocalVisBackend'),
194
+ dict(type='TensorboardVisBackend'),
195
+ ]
196
+ visualizer = dict(
197
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
198
+
199
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split2_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split2_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split2_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split2_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split3_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split3_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split3_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split3_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split4_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split4_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split4_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split4_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/1shot-swin-gte/graph_split5_config.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='pretrained/swinv2_small_1k_500k.pth',
51
+ text_pretrained='Alibaba-NLP/gte-base-en-v1.5',
52
+ finetune_text_pretrained=False,
53
+ encoder_config=dict(
54
+ type='SwinTransformerV2',
55
+ embed_dim=96,
56
+ depths=[2, 2, 18, 2],
57
+ num_heads=[3, 6, 12, 24],
58
+ window_size=16,
59
+ drop_path_rate=0.3,
60
+ img_size=256,
61
+ upsample="bilinear"
62
+ ),
63
+ keypoint_head=dict(
64
+ type='PoseHead',
65
+ img_in_channels=768,
66
+ text_in_channels=768,
67
+ # text_in_channels=512,
68
+ transformer=dict(
69
+ type='EncoderDecoder',
70
+ d_model=256,
71
+ nhead=8,
72
+ num_encoder_layers=3,
73
+ num_decoder_layers=3,
74
+ graph_decoder='pre',
75
+ dim_feedforward=768,
76
+ dropout=0.1,
77
+ similarity_proj_dim=256,
78
+ dynamic_proj_dim=128,
79
+ activation="relu",
80
+ normalize_before=False,
81
+ return_intermediate_dec=True),
82
+ share_kpt_branch=False,
83
+ num_decoder_layer=3,
84
+ with_heatmap_loss=True,
85
+
86
+ heatmap_loss_weight=2.0,
87
+ support_order_dropout=-1,
88
+ positional_encoding=dict(
89
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
90
+ # training and testing settings
91
+ train_cfg=dict(),
92
+ test_cfg=dict(
93
+ flip_test=False,
94
+ post_process='default',
95
+ shift_heatmap=True,
96
+ modulate_kernel=11))
97
+
98
+ data_cfg = dict(
99
+ image_size=[256, 256],
100
+ heatmap_size=[64, 64],
101
+ num_output_channels=channel_cfg['num_output_channels'],
102
+ num_joints=channel_cfg['dataset_joints'],
103
+ dataset_channel=channel_cfg['dataset_channel'],
104
+ inference_channel=channel_cfg['inference_channel'])
105
+
106
+ train_pipeline = [
107
+ dict(type='LoadImageFromFile'),
108
+ dict(
109
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
110
+ scale_factor=0.15),
111
+ dict(type='TopDownAffineFewShot'),
112
+ dict(type='ToTensor'),
113
+ dict(
114
+ type='NormalizeTensor',
115
+ mean=[0.485, 0.456, 0.406],
116
+ std=[0.229, 0.224, 0.225]),
117
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
118
+ dict(
119
+ type='Collect',
120
+ keys=['img', 'target', 'target_weight'],
121
+ meta_keys=[
122
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
123
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
124
+ ]),
125
+ ]
126
+
127
+ valid_pipeline = [
128
+ dict(type='LoadImageFromFile'),
129
+ dict(type='TopDownAffineFewShot'),
130
+ dict(type='ToTensor'),
131
+ dict(
132
+ type='NormalizeTensor',
133
+ mean=[0.485, 0.456, 0.406],
134
+ std=[0.229, 0.224, 0.225]),
135
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
136
+ dict(
137
+ type='Collect',
138
+ keys=['img', 'target', 'target_weight'],
139
+ meta_keys=[
140
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
141
+ 'flip_pairs', 'category_id',
142
+ 'skeleton',
143
+ ]),
144
+ ]
145
+
146
+ test_pipeline = valid_pipeline
147
+
148
+ data_root = 'data/mp100'
149
+ data = dict(
150
+ samples_per_gpu=16,
151
+ workers_per_gpu=16,
152
+ # samples_per_gpu=8,
153
+ # workers_per_gpu=8,
154
+ train=dict(
155
+ type='TransformerPoseDataset',
156
+ ann_file=f'{data_root}/annotations/mp100_split5_train.json',
157
+ img_prefix=f'{data_root}/images/',
158
+ # img_prefix=f'{data_root}',
159
+ data_cfg=data_cfg,
160
+ valid_class_ids=None,
161
+ max_kpt_num=channel_cfg['max_kpt_num'],
162
+ num_shots=1,
163
+ pipeline=train_pipeline),
164
+ val=dict(
165
+ type='TransformerPoseDataset',
166
+ ann_file=f'{data_root}/annotations/mp100_split5_val.json',
167
+ img_prefix=f'{data_root}/images/',
168
+ # img_prefix=f'{data_root}',
169
+ data_cfg=data_cfg,
170
+ valid_class_ids=None,
171
+ max_kpt_num=channel_cfg['max_kpt_num'],
172
+ num_shots=1,
173
+ num_queries=15,
174
+ num_episodes=100,
175
+ pipeline=valid_pipeline),
176
+ test=dict(
177
+ type='TestPoseDataset',
178
+ ann_file=f'{data_root}/annotations/mp100_split5_test.json',
179
+ img_prefix=f'{data_root}/images/',
180
+ # img_prefix=f'{data_root}',
181
+ data_cfg=data_cfg,
182
+ valid_class_ids=None,
183
+ max_kpt_num=channel_cfg['max_kpt_num'],
184
+ num_shots=1,
185
+ num_queries=15,
186
+ num_episodes=200,
187
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
188
+ pipeline=test_pipeline),
189
+ )
190
+ vis_backends = [
191
+ dict(type='LocalVisBackend'),
192
+ dict(type='TensorboardVisBackend'),
193
+ ]
194
+ visualizer = dict(
195
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
196
+
197
+ shuffle_cfg = dict(interval=1)
configs/_base_/datasets/ap10k.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset_info = dict(
2
+ dataset_name='ap10k',
3
+ paper_info=dict(
4
+ author='Yu, Hang and Xu, Yufei and Zhang, Jing and '
5
+ 'Zhao, Wei and Guan, Ziyu and Tao, Dacheng',
6
+ title='AP-10K: A Benchmark for Animal Pose Estimation in the Wild',
7
+ container='35th Conference on Neural Information Processing Systems '
8
+ '(NeurIPS 2021) Track on Datasets and Bench-marks.',
9
+ year='2021',
10
+ homepage='https://github.com/AlexTheBad/AP-10K',
11
+ ),
12
+ keypoint_info={
13
+ 0:
14
+ dict(
15
+ name='L_Eye', id=0, color=[0, 255, 0], type='upper', swap='R_Eye'),
16
+ 1:
17
+ dict(
18
+ name='R_Eye',
19
+ id=1,
20
+ color=[255, 128, 0],
21
+ type='upper',
22
+ swap='L_Eye'),
23
+ 2:
24
+ dict(name='Nose', id=2, color=[51, 153, 255], type='upper', swap=''),
25
+ 3:
26
+ dict(name='Neck', id=3, color=[51, 153, 255], type='upper', swap=''),
27
+ 4:
28
+ dict(
29
+ name='Root of tail',
30
+ id=4,
31
+ color=[51, 153, 255],
32
+ type='lower',
33
+ swap=''),
34
+ 5:
35
+ dict(
36
+ name='L_Shoulder',
37
+ id=5,
38
+ color=[51, 153, 255],
39
+ type='upper',
40
+ swap='R_Shoulder'),
41
+ 6:
42
+ dict(
43
+ name='L_Elbow',
44
+ id=6,
45
+ color=[51, 153, 255],
46
+ type='upper',
47
+ swap='R_Elbow'),
48
+ 7:
49
+ dict(
50
+ name='L_F_Paw',
51
+ id=7,
52
+ color=[0, 255, 0],
53
+ type='upper',
54
+ swap='R_F_Paw'),
55
+ 8:
56
+ dict(
57
+ name='R_Shoulder',
58
+ id=8,
59
+ color=[0, 255, 0],
60
+ type='upper',
61
+ swap='L_Shoulder'),
62
+ 9:
63
+ dict(
64
+ name='R_Elbow',
65
+ id=9,
66
+ color=[255, 128, 0],
67
+ type='upper',
68
+ swap='L_Elbow'),
69
+ 10:
70
+ dict(
71
+ name='R_F_Paw',
72
+ id=10,
73
+ color=[0, 255, 0],
74
+ type='lower',
75
+ swap='L_F_Paw'),
76
+ 11:
77
+ dict(
78
+ name='L_Hip',
79
+ id=11,
80
+ color=[255, 128, 0],
81
+ type='lower',
82
+ swap='R_Hip'),
83
+ 12:
84
+ dict(
85
+ name='L_Knee',
86
+ id=12,
87
+ color=[255, 128, 0],
88
+ type='lower',
89
+ swap='R_Knee'),
90
+ 13:
91
+ dict(
92
+ name='L_B_Paw',
93
+ id=13,
94
+ color=[0, 255, 0],
95
+ type='lower',
96
+ swap='R_B_Paw'),
97
+ 14:
98
+ dict(
99
+ name='R_Hip', id=14, color=[0, 255, 0], type='lower',
100
+ swap='L_Hip'),
101
+ 15:
102
+ dict(
103
+ name='R_Knee',
104
+ id=15,
105
+ color=[0, 255, 0],
106
+ type='lower',
107
+ swap='L_Knee'),
108
+ 16:
109
+ dict(
110
+ name='R_B_Paw',
111
+ id=16,
112
+ color=[0, 255, 0],
113
+ type='lower',
114
+ swap='L_B_Paw'),
115
+ },
116
+ skeleton_info={
117
+ 0: dict(link=('L_Eye', 'R_Eye'), id=0, color=[0, 0, 255]),
118
+ 1: dict(link=('L_Eye', 'Nose'), id=1, color=[0, 0, 255]),
119
+ 2: dict(link=('R_Eye', 'Nose'), id=2, color=[0, 0, 255]),
120
+ 3: dict(link=('Nose', 'Neck'), id=3, color=[0, 255, 0]),
121
+ 4: dict(link=('Neck', 'Root of tail'), id=4, color=[0, 255, 0]),
122
+ 5: dict(link=('Neck', 'L_Shoulder'), id=5, color=[0, 255, 255]),
123
+ 6: dict(link=('L_Shoulder', 'L_Elbow'), id=6, color=[0, 255, 255]),
124
+ 7: dict(link=('L_Elbow', 'L_F_Paw'), id=6, color=[0, 255, 255]),
125
+ 8: dict(link=('Neck', 'R_Shoulder'), id=7, color=[6, 156, 250]),
126
+ 9: dict(link=('R_Shoulder', 'R_Elbow'), id=8, color=[6, 156, 250]),
127
+ 10: dict(link=('R_Elbow', 'R_F_Paw'), id=9, color=[6, 156, 250]),
128
+ 11: dict(link=('Root of tail', 'L_Hip'), id=10, color=[0, 255, 255]),
129
+ 12: dict(link=('L_Hip', 'L_Knee'), id=11, color=[0, 255, 255]),
130
+ 13: dict(link=('L_Knee', 'L_B_Paw'), id=12, color=[0, 255, 255]),
131
+ 14: dict(link=('Root of tail', 'R_Hip'), id=13, color=[6, 156, 250]),
132
+ 15: dict(link=('R_Hip', 'R_Knee'), id=14, color=[6, 156, 250]),
133
+ 16: dict(link=('R_Knee', 'R_B_Paw'), id=15, color=[6, 156, 250]),
134
+ },
135
+ joint_weights=[
136
+ 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5,
137
+ 1.5
138
+ ],
139
+ sigmas=[
140
+ 0.025, 0.025, 0.026, 0.035, 0.035, 0.079, 0.072, 0.062, 0.079, 0.072,
141
+ 0.062, 0.107, 0.087, 0.089, 0.107, 0.087, 0.089
142
+ ])
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoint_config = dict(interval=10)
2
+
3
+ log_config = dict(
4
+ interval=50,
5
+ hooks=[
6
+ dict(type='TextLoggerHook'),
7
+ # dict(type='TensorboardLoggerHook')
8
+ # dict(type='PaviLoggerHook') # for internal services
9
+ ])
10
+
11
+ log_level = 'INFO'
12
+ load_from = None
13
+ resume_from = None
14
+ dist_params = dict(backend='nccl')
15
+ workflow = [('train', 1)]
16
+
17
+ # disable opencv multithreading to avoid system being overloaded
18
+ opencv_num_threads = 0
19
+ # set multi-process start method as `fork` to speed up the training
20
+ mp_start_method = 'fork'
configs/demo_b.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_level = 'INFO'
2
+ load_from = None
3
+ resume_from = None
4
+ dist_params = dict(backend='nccl')
5
+ workflow = [('train', 1)]
6
+ checkpoint_config = dict(interval=20)
7
+ evaluation = dict(
8
+ interval=25,
9
+ metric=['PCK', 'NME', 'AUC', 'EPE'],
10
+ key_indicator='PCK',
11
+ gpu_collect=True,
12
+ res_folder='')
13
+ optimizer = dict(
14
+ type='Adam',
15
+ lr=1e-5,
16
+ )
17
+
18
+ optimizer_config = dict(grad_clip=None)
19
+ # learning policy
20
+ lr_config = dict(
21
+ policy='step',
22
+ warmup='linear',
23
+ warmup_iters=1000,
24
+ warmup_ratio=0.001,
25
+ step=[160, 180])
26
+ total_epochs = 200
27
+ log_config = dict(
28
+ interval=50,
29
+ hooks=[
30
+ dict(type='TextLoggerHook'),
31
+ dict(type='TensorboardLoggerHook')
32
+ ])
33
+
34
+ channel_cfg = dict(
35
+ num_output_channels=1,
36
+ dataset_joints=1,
37
+ dataset_channel=[
38
+ [
39
+ 0,
40
+ ],
41
+ ],
42
+ inference_channel=[
43
+ 0,
44
+ ],
45
+ max_kpt_num=100)
46
+
47
+ # model settings
48
+ model = dict(
49
+ type='PoseAnythingModel',
50
+ pretrained='swinv2_small',
51
+ encoder_config=dict(
52
+ type='SwinTransformerV2',
53
+ embed_dim=96,
54
+ depths=[2, 2, 18, 2],
55
+ num_heads=[3, 6, 12, 24],
56
+ window_size=16,
57
+ drop_path_rate=0.3,
58
+ img_size=256,
59
+ upsample="bilinear"
60
+ ),
61
+ keypoint_head=dict(
62
+ type='PoseHead',
63
+ in_channels=768,
64
+ transformer=dict(
65
+ type='EncoderDecoder',
66
+ d_model=256,
67
+ nhead=8,
68
+ num_encoder_layers=3,
69
+ num_decoder_layers=3,
70
+ graph_decoder='pre',
71
+ dim_feedforward=768,
72
+ dropout=0.1,
73
+ similarity_proj_dim=256,
74
+ dynamic_proj_dim=128,
75
+ activation="relu",
76
+ normalize_before=False,
77
+ return_intermediate_dec=True),
78
+ share_kpt_branch=False,
79
+ num_decoder_layer=3,
80
+ with_heatmap_loss=True,
81
+
82
+ heatmap_loss_weight=2.0,
83
+ support_order_dropout=-1,
84
+ positional_encoding=dict(
85
+ type='SinePositionalEncoding', num_feats=128, normalize=True)),
86
+ # training and testing settings
87
+ train_cfg=dict(),
88
+ test_cfg=dict(
89
+ flip_test=False,
90
+ post_process='default',
91
+ shift_heatmap=True,
92
+ modulate_kernel=11))
93
+
94
+ data_cfg = dict(
95
+ image_size=[256, 256],
96
+ heatmap_size=[64, 64],
97
+ num_output_channels=channel_cfg['num_output_channels'],
98
+ num_joints=channel_cfg['dataset_joints'],
99
+ dataset_channel=channel_cfg['dataset_channel'],
100
+ inference_channel=channel_cfg['inference_channel'])
101
+
102
+ train_pipeline = [
103
+ dict(type='LoadImageFromFile'),
104
+ dict(
105
+ type='TopDownGetRandomScaleRotation', rot_factor=15,
106
+ scale_factor=0.15),
107
+ dict(type='TopDownAffineFewShot'),
108
+ dict(type='ToTensor'),
109
+ dict(
110
+ type='NormalizeTensor',
111
+ mean=[0.485, 0.456, 0.406],
112
+ std=[0.229, 0.224, 0.225]),
113
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
114
+ dict(
115
+ type='Collect',
116
+ keys=['img', 'target', 'target_weight'],
117
+ meta_keys=[
118
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
119
+ 'rotation', 'bbox_score', 'flip_pairs', 'category_id', 'skeleton',
120
+ ]),
121
+ ]
122
+
123
+ valid_pipeline = [
124
+ dict(type='LoadImageFromFile'),
125
+ dict(type='TopDownAffineFewShot'),
126
+ dict(type='ToTensor'),
127
+ dict(
128
+ type='NormalizeTensor',
129
+ mean=[0.485, 0.456, 0.406],
130
+ std=[0.229, 0.224, 0.225]),
131
+ dict(type='TopDownGenerateTargetFewShot', sigma=1),
132
+ dict(
133
+ type='Collect',
134
+ keys=['img', 'target', 'target_weight'],
135
+ meta_keys=[
136
+ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', 'rotation', 'bbox_score',
137
+ 'flip_pairs', 'category_id',
138
+ 'skeleton',
139
+ ]),
140
+ ]
141
+
142
+ test_pipeline = valid_pipeline
143
+
144
+ data_root = 'data/mp100'
145
+ data = dict(
146
+ samples_per_gpu=8,
147
+ workers_per_gpu=8,
148
+ train=dict(
149
+ type='TransformerPoseDataset',
150
+ ann_file=f'{data_root}/annotations/mp100_split1_train.json',
151
+ img_prefix=f'{data_root}/images/',
152
+ # img_prefix=f'{data_root}',
153
+ data_cfg=data_cfg,
154
+ valid_class_ids=None,
155
+ max_kpt_num=channel_cfg['max_kpt_num'],
156
+ num_shots=1,
157
+ pipeline=train_pipeline),
158
+ val=dict(
159
+ type='TransformerPoseDataset',
160
+ ann_file=f'{data_root}/annotations/mp100_split1_val.json',
161
+ img_prefix=f'{data_root}/images/',
162
+ # img_prefix=f'{data_root}',
163
+ data_cfg=data_cfg,
164
+ valid_class_ids=None,
165
+ max_kpt_num=channel_cfg['max_kpt_num'],
166
+ num_shots=1,
167
+ num_queries=15,
168
+ num_episodes=100,
169
+ pipeline=valid_pipeline),
170
+ test=dict(
171
+ type='TestPoseDataset',
172
+ ann_file=f'{data_root}/annotations/mp100_split1_test.json',
173
+ img_prefix=f'{data_root}/images/',
174
+ # img_prefix=f'{data_root}',
175
+ data_cfg=data_cfg,
176
+ valid_class_ids=None,
177
+ max_kpt_num=channel_cfg['max_kpt_num'],
178
+ num_shots=1,
179
+ num_queries=15,
180
+ num_episodes=200,
181
+ pck_threshold_list=[0.05, 0.10, 0.15, 0.2, 0.25],
182
+ pipeline=test_pipeline),
183
+ )
184
+ vis_backends = [
185
+ dict(type='LocalVisBackend'),
186
+ dict(type='TensorboardVisBackend'),
187
+ ]
188
+ visualizer = dict(
189
+ type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer')
190
+
191
+ shuffle_cfg = dict(interval=1)
demo_text.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import os
4
+ import pickle
5
+ import random
6
+ import cv2
7
+ import numpy as np
8
+ import string
9
+ import torch
10
+ from mmcv import Config, DictAction
11
+ from mmcv.cnn import fuse_conv_bn
12
+ from mmcv.runner import load_checkpoint
13
+ from mmpose.core import wrap_fp16_model
14
+ from mmpose.models import build_posenet
15
+ from torchvision import transforms
16
+ from models import *
17
+ import torchvision.transforms.functional as F
18
+
19
+ from tools.visualization import plot_results, plot_query_results, plot_modified_query
20
+ import ast
21
+ import shutil
22
+
23
# Color palette (one triplet per keypoint) used for visualization.
# Sweeps the hue wheel; note the first and last entries are both pure red.
# NOTE(review): channel order (RGB vs cv2's BGR) depends on the plotting code — confirm.
COLORS = [
    [255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0],
    [85, 255, 0], [0, 255, 0], [0, 255, 85], [0, 255, 170], [0, 255, 255],
    [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], [170, 0, 255],
    [255, 0, 255], [255, 0, 170], [255, 0, 85], [255, 0, 0]]
28
+
29
class Resize_Pad:
    """Zero-pad a CHW image tensor to a square, then resize to (h, w).

    The shorter spatial dimension is padded evenly on both sides so the
    content stays centered; near-square inputs (aspect ratio ~1 after
    rounding to 2 decimals) skip the padding step.
    """

    def __init__(self, w=256, h=256):
        self.w = w
        self.h = h

    def __call__(self, image):
        _, dim0, dim1 = image.shape
        aspect = dim0 / dim1
        # Near-square already: resize directly without padding.
        if round(aspect, 2) == 1:
            return F.resize(image, [self.h, self.w])
        if aspect > 1:
            # dim0 is larger: pad dim1 evenly on both sides.
            half_pad = int(dim0 - dim1) // 2
            padded = F.pad(image, (half_pad, 0, half_pad, 0), 0, "constant")
        else:
            # dim1 is larger: pad dim0 evenly on both sides.
            half_pad = int(dim1 - dim0) // 2
            padded = F.pad(image, (0, half_pad, 0, half_pad), 0, "constant")
        return F.resize(padded, [self.h, self.w])
52
+
53
+
54
def transform_keypoints_to_pad_and_resize(keypoints, image_size):
    """Map keypoints from original image coordinates to the 256x256 pad-and-resize frame.

    Mirrors Resize_Pad: the shorter image side is symmetrically zero-padded
    to a square, and the square is resized to 256x256, so keypoints are
    shifted by half the pad and scaled by 256 / longer_side.

    Args:
        keypoints (torch.Tensor): (K, 2) tensor of (x, y) coordinates.
        image_size: sequence whose first two entries are (h, w) of the
            original image.

    Returns:
        torch.Tensor: transformed copy of ``keypoints`` (input not mutated).
    """
    trans_keypoints = keypoints.clone()
    h, w = image_size[:2]
    ratio_1 = w / h
    if ratio_1 > 1:
        # Wider than tall: height is padded, so shift y by half the pad.
        hp = int(w - h) // 2
        trans_keypoints[:, 1] = keypoints[:, 1] + hp
        trans_keypoints *= (256. / w)
    else:
        # Taller than (or equal to) wide: width is padded, so shift x by half the pad.
        # BUG FIX: pad was computed as w - h (negative in this branch); it must be h - w.
        wp = int(h - w) // 2
        trans_keypoints[:, 0] = keypoints[:, 0] + wp
        trans_keypoints *= (256. / h)
    return trans_keypoints
71
+
72
+
73
def parse_args():
    """Parse command-line arguments for the text-prompted Pose Anything demo.

    Returns:
        argparse.Namespace: parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Pose Anything Demo')
    parser.add_argument('--support_points', help='support keypoints text descriptions')
    parser.add_argument('--support_skeleton', help='list of keypoints skeleton')
    parser.add_argument('--query', help='Image file')
    parser.add_argument('--config', default=None, help='test config file path')
    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
    # BUG FIX: --outdir help previously read 'checkpoint file' (copy-paste error).
    parser.add_argument('--outdir', default='output', help='output directory')

    parser.add_argument(
        '--fuse-conv-bn',
        action='store_true',
        help='Whether to fuse conv and bn, this will slightly increase'
             'the inference speed')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        default={},
        help='override some settings in the used config, the key-value pair '
             'in xxx=yyy format will be merged into config file. For example, '
             "'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
    args = parser.parse_args()
    return args
97
+
98
+
99
def merge_configs(cfg1, cfg2):
    """Merge ``cfg2`` into ``cfg1`` and return the result.

    On key collision, ``cfg2`` wins; entries of ``cfg2`` whose value is None
    are ignored. Either argument may be None (treated as empty). ``cfg1`` is
    copied, so neither input is mutated.

    Args:
        cfg1 (dict | None): base configuration.
        cfg2 (dict | None): overriding configuration.

    Returns:
        dict: merged configuration.
    """
    merged = {} if cfg1 is None else cfg1.copy()
    cfg2 = {} if cfg2 is None else cfg2
    for key, value in cfg2.items():
        # BUG FIX: was `if value:` which also dropped legitimate falsy values
        # such as 0, False and empty containers; the documented intent is to
        # skip only None.
        if value is not None:
            merged[key] = value
    return merged
108
+
109
+
110
def main():
    """Run single-image inference for the text-prompted pose model.

    Reads CLI args (see parse_args), builds placeholder support inputs from
    the textual keypoint descriptions, runs the model on the query image and
    writes a visualization plus a copy of the input into ``args.outdir``.
    """
    # Fix all RNG seeds for reproducible inference.
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    args = parse_args()
    cfg = Config.fromfile(args.config)

    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    cfg.data.test.test_mode = True

    os.makedirs(args.outdir, exist_ok=True)

    # Load data: keypoint descriptions are passed as a Python-literal string.
    point_descriptions = ast.literal_eval(args.support_points)
    query_img = cv2.imread(args.query)
    if query_img is None:
        raise ValueError('Fail to read image')

    # just a placeholder, we don't have input keypoints
    kp_src = torch.zeros((len(point_descriptions), 2))

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        Resize_Pad(cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size)])

    # NOTE(review): `skeleton` is only assigned when --support_skeleton is
    # given; omitting the flag raises NameError further down — confirm the
    # argument is effectively required.
    if args.support_skeleton is not None:
        skeleton = ast.literal_eval(args.support_skeleton)
        if len(skeleton) == 0:
            skeleton = [(0, 0)]

    model_device = "cuda" if torch.cuda.is_available() else "cpu"

    # .flip(0) reverses the channel axis (cv2 reads BGR; model presumably expects RGB).
    query_img = preprocess(query_img).flip(0)[None].to(model_device)
    # Create heatmap from keypoints
    genHeatMap = TopDownGenerateTargetFewShot()
    data_cfg = cfg.data_cfg
    data_cfg['image_size'] = np.array([cfg.model.encoder_config.img_size, cfg.model.encoder_config.img_size])
    data_cfg['joint_weights'] = None
    data_cfg['use_different_joint_weights'] = False
    # Append a zero third column: (x, y) -> (x, y, 0) as expected by the target generator.
    kp_src_3d = torch.concatenate((kp_src, torch.zeros(kp_src.shape[0], 1)), dim=-1)
    kp_src_3d_weight = torch.concatenate((torch.ones_like(kp_src), torch.zeros(kp_src.shape[0], 1)), dim=-1)

    # everything that is related to the support image is used as placeholder
    target_s, target_weight_s = genHeatMap._msra_generate_target(data_cfg, kp_src_3d, kp_src_3d_weight, sigma=1)
    target_s = torch.tensor(target_s).float()[None]
    target_weight_s = torch.tensor(target_weight_s).float()[None].to(model_device)

    # Assemble the model's forward kwargs; support-side entries are dummies
    # since only text descriptions (not a support image) drive this demo.
    data = {
        'img_s': [0],
        'img_q': query_img,
        'target_s': [target_s],
        'target_weight_s': [target_weight_s],
        'target_q': None,
        'target_weight_q': None,
        'return_loss': False,
        'img_metas': [{'sample_skeleton': [skeleton],
                       'query_skeleton': skeleton,
                       'sample_point_descriptions': np.array([point_descriptions]),
                       'sample_joints_3d': [kp_src_3d],
                       'query_joints_3d': kp_src_3d,
                       'sample_center': [kp_src.mean(dim=0)],
                       'query_center': kp_src.mean(dim=0),
                       'sample_scale': [kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0]],
                       'query_scale': kp_src.max(dim=0)[0] - kp_src.min(dim=0)[0],
                       'sample_rotation': [0],
                       'query_rotation': 0,
                       'sample_bbox_score': [1],
                       'query_bbox_score': 1,
                       'query_image_file': '',
                       'sample_image_file': [''],
                       }]
    }

    # Load model
    model = build_posenet(cfg.model)
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        wrap_fp16_model(model)
    load_checkpoint(model, args.checkpoint, map_location='cpu')
    if args.fuse_conv_bn:
        model = fuse_conv_bn(model)
    model.to(model_device)
    model.eval()

    with torch.no_grad():
        outputs = model(**data)

    # visualize results
    vis_q_weight = target_weight_s[0]
    # CHW tensor -> HWC numpy array for plotting.
    vis_q_image = query_img[0].detach().cpu().numpy().transpose(1, 2, 0)

    name_idx = plot_query_results(vis_q_image, vis_q_weight, skeleton, torch.tensor(outputs['points']).squeeze(0), out_dir=args.outdir)
    # Keep a copy of the raw input next to the rendered output.
    shutil.copyfile(args.query, f'./{args.outdir}/{str(name_idx)}_query_in.png')


if __name__ == '__main__':
    main()
docker/Dockerfile ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base-image version pins (overridable at build time with --build-arg).
ARG PYTORCH="2.0.1"
ARG CUDA="11.7"
ARG CUDNN="8"

FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

# Target CUDA architectures for any compiled extensions.
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
# Non-interactive apt so the build never blocks on prompts.
ENV TZ=Asia/Kolkata DEBIAN_FRONTEND=noninteractive
# To fix GPG key error when running apt-get update
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub

# System libraries required by OpenCV/mmcv (GL, X11) plus git and ninja for source builds.
RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Install xtcocotools (needs cython available first)
RUN pip install cython
RUN pip install xtcocotools
# Install MMEngine and MMCV via openmim
RUN pip install openmim
RUN mim install mmengine
RUN mim install "mmpose==0.28.1"
RUN mim install "mmcv-full==1.5.3"
RUN pip install -U torchmetrics timm
RUN pip install numpy scipy --upgrade
RUN pip install future tensorboard

# Some other requirements: CLIP backbone, yapf pinned for mmcv compatibility, HF transformers
RUN pip install git+https://github.com/openai/CLIP.git
RUN pip install yapf==0.40.1
RUN pip install transformers

# NOTE(review): `WORKDIR CapeX` here combined with the `CapeX/...` COPY
# destinations and the second `WORKDIR CapeX` below yields a nested
# CapeX/CapeX layout — confirm this is intentional.
WORKDIR CapeX

COPY models CapeX/models
COPY configs CapeX/configs
COPY pretrained CapeX/pretrained
COPY requirements.txt CapeX/
COPY tools CapeX/tools
COPY setup.cfg CapeX/
COPY setup.py CapeX/
COPY test.py CapeX/
COPY train.py CapeX/
COPY README.md CapeX/
COPY run_me.sh CapeX/

# Placeholder for the MP-100 dataset (expected to be mounted or copied in later).
RUN mkdir -p CapeX/data/mp100
WORKDIR CapeX

# Install MMPose
RUN conda clean --all
ENV FORCE_CUDA="1"
RUN python setup.py develop

#CMD ["bash"]
#CMD ["/bin/bash", "-c", "chmod +x run_me.sh && ./run_me.sh"]
environment.yml ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: capex
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ dependencies:
7
+ - _libgcc_mutex=0.1
8
+ - _openmp_mutex=5.1
9
+ - blas=1.0
10
+ - ca-certificates=2024.3.11
11
+ - cuda-cudart=12.1.105
12
+ - cuda-cupti=12.1.105
13
+ - cuda-libraries=12.1.0
14
+ - cuda-nvrtc=12.1.105
15
+ - cuda-nvtx=12.1.105
16
+ - cuda-opencl=12.4.99
17
+ - cuda-runtime=12.1.0
18
+ - cudatoolkit=11.8.0
19
+ - filelock=3.13.1
20
+ - gmp=6.2.1
21
+ - gmpy2=2.1.2
22
+ - intel-openmp=2023.1.0
23
+ - jinja2=3.1.3
24
+ - ld_impl_linux-64=2.38
25
+ - libcublas=12.1.0.26
26
+ - libcufft=11.0.2.4
27
+ - libcufile=1.9.0.20
28
+ - libcurand=10.3.5.119
29
+ - libcusolver=11.4.4.55
30
+ - libcusparse=12.0.2.55
31
+ - libffi=3.4.4
32
+ - libgcc-ng=11.2.0
33
+ - libgomp=11.2.0
34
+ - libnpp=12.0.2.50
35
+ - libnvjitlink=12.1.105
36
+ - libnvjpeg=12.1.1.14
37
+ - libstdcxx-ng=11.2.0
38
+ - markupsafe=2.1.3
39
+ - mkl=2023.1.0
40
+ - mpc=1.1.0
41
+ - mpfr=4.0.2
42
+ - mpmath=1.3.0
43
+ - ncurses=6.4
44
+ - networkx=3.1
45
+ - openssl=3.0.13
46
+ - pip=23.3.1
47
+ - python=3.8.18
48
+ - pytorch-cuda=12.1
49
+ - pytorch-mutex=1.0
50
+ - readline=8.2
51
+ - setuptools=68.2.2
52
+ - sqlite=3.41.2
53
+ - sympy=1.12
54
+ - tbb=2021.8.0
55
+ - tk=8.6.12
56
+ - typing_extensions=4.9.0
57
+ - wheel=0.41.2
58
+ - xz=5.4.6
59
+ - zlib=1.2.13
60
+ - pip:
61
+ - absl-py==2.1.0
62
+ - addict==2.4.0
63
+ - aiofiles==23.2.1
64
+ - aiohttp==3.9.3
65
+ - aiosignal==1.3.1
66
+ - altair==5.3.0
67
+ - annotated-types==0.6.0
68
+ - antlr4-python3-runtime==4.9.3
69
+ - anyio==4.3.0
70
+ - async-timeout==4.0.3
71
+ - attrs==23.2.0
72
+ - cachetools==5.3.3
73
+ - certifi==2024.2.2
74
+ - charset-normalizer==3.3.2
75
+ - chumpy==0.70
76
+ - click==8.1.7
77
+ - git+https://github.com/openai/CLIP.git
78
+ - contourpy==1.1.1
79
+ - cycler==0.12.1
80
+ - cython==3.0.9
81
+ - dnspython==2.6.1
82
+ - email-validator==2.1.1
83
+ - exceptiongroup==1.2.1
84
+ - fastapi==0.111.0
85
+ - fastapi-cli==0.0.3
86
+ - ffmpy==0.3.2
87
+ - fonttools==4.49.0
88
+ - frozenlist==1.4.1
89
+ - fsspec==2024.2.0
90
+ - ftfy==6.2.0
91
+ - future==1.0.0
92
+ - google-auth==2.28.2
93
+ - google-auth-oauthlib==1.0.0
94
+ - gradio==4.31.0
95
+ - gradio-client==0.16.2
96
+ - grpcio==1.62.1
97
+ - h11==0.14.0
98
+ - httpcore==1.0.5
99
+ - httptools==0.6.1
100
+ - httpx==0.27.0
101
+ - huggingface-hub==0.21.4
102
+ - idna==3.6
103
+ - importlib-metadata==7.0.1
104
+ - importlib-resources==6.1.2
105
+ - joblib==1.4.0
106
+ - json-tricks==3.17.3
107
+ - jsonschema==4.22.0
108
+ - jsonschema-specifications==2023.12.1
109
+ - kiwisolver==1.4.5
110
+ - kornia==0.7.2
111
+ - kornia-rs==0.1.3
112
+ - lightning-utilities==0.11.2
113
+ - markdown==3.5.2
114
+ - markdown-it-py==3.0.0
115
+ - matplotlib==3.7.5
116
+ - mdurl==0.1.2
117
+ - mmcv-full==1.6.2
118
+ - mmpose==0.29.0
119
+ - multidict==6.0.5
120
+ - munkres==1.1.4
121
+ - numpy==1.24.4
122
+ - nvidia-cublas-cu12==12.1.3.1
123
+ - nvidia-cuda-cupti-cu12==12.1.105
124
+ - nvidia-cuda-nvrtc-cu12==12.1.105
125
+ - nvidia-cuda-runtime-cu12==12.1.105
126
+ - nvidia-cudnn-cu12==8.9.2.26
127
+ - nvidia-cufft-cu12==11.0.2.54
128
+ - nvidia-curand-cu12==10.3.2.106
129
+ - nvidia-cusolver-cu12==11.4.5.107
130
+ - nvidia-cusparse-cu12==12.1.0.106
131
+ - nvidia-nccl-cu12==2.19.3
132
+ - nvidia-nvjitlink-cu12==12.4.99
133
+ - nvidia-nvtx-cu12==12.1.105
134
+ - oauthlib==3.2.2
135
+ - omegaconf==2.3.0
136
+ - opencv-python==4.9.0.80
137
+ - orjson==3.10.3
138
+ - packaging==23.2
139
+ - pandas==2.0.3
140
+ - pillow==10.2.0
141
+ - pkgutil-resolve-name==1.3.10
142
+ - platformdirs==4.2.0
143
+ - protobuf==4.25.3
144
+ - pyasn1==0.5.1
145
+ - pyasn1-modules==0.3.0
146
+ - pydantic==2.7.1
147
+ - pydantic-core==2.18.2
148
+ - pydub==0.25.1
149
+ - pygments==2.18.0
150
+ - pyparsing==3.1.2
151
+ - python-dateutil==2.9.0.post0
152
+ - python-dotenv==1.0.1
153
+ - python-multipart==0.0.9
154
+ - pytorch-lightning==2.2.1
155
+ - pytz==2024.1
156
+ - pyyaml==6.0.1
157
+ - referencing==0.35.1
158
+ - regex==2023.12.25
159
+ - requests==2.31.0
160
+ - requests-oauthlib==1.4.0
161
+ - rich==13.7.1
162
+ - rpds-py==0.18.1
163
+ - rsa==4.9
164
+ - ruff==0.4.4
165
+ - safetensors==0.4.2
166
+ - scikit-learn==1.3.2
167
+ - scipy==1.10.1
168
+ - semantic-version==2.10.0
169
+ - sentencepiece==0.2.0
170
+ - shellingham==1.5.4
171
+ - six==1.16.0
172
+ - sniffio==1.3.1
173
+ - starlette==0.37.2
174
+ - tensorboard==2.14.0
175
+ - tensorboard-data-server==0.7.2
176
+ - threadpoolctl==3.4.0
177
+ - timm==0.4.12
178
+ - tokenizers==0.15.2
179
+ - tomli==2.0.1
180
+ - tomlkit==0.12.0
181
+ - toolz==0.12.1
182
+ - torch==2.2.1
183
+ - torchmetrics==1.3.2
184
+ - torchvision==0.17.1
185
+ - tqdm==4.66.2
186
+ - transformers==4.38.2
187
+ - triton==2.2.0
188
+ - typer==0.12.3
189
+ - tzdata==2024.1
190
+ - ujson==5.9.0
191
+ - urllib3==2.2.1
192
+ - uvicorn==0.29.0
193
+ - uvloop==0.19.0
194
+ - watchfiles==0.21.0
195
+ - wcwidth==0.2.13
196
+ - websockets==11.0.3
197
+ - werkzeug==3.0.1
198
+ - xtcocotools==1.14.3
199
+ - yapf==0.40.1
200
+ - yarl==1.9.4
201
+ - zipp==3.17.0
examples/animal.png ADDED
examples/car.png ADDED
examples/chair.png ADDED
examples/person.png ADDED
models/VERSION ADDED
@@ -0,0 +1 @@
 
 
1
+ 0.2.0
models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .core import * # noqa
2
+ from .datasets import * # noqa
3
+ from .models import * # noqa
models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (195 Bytes). View file
 
models/apis/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .train import train_model
2
+
3
+ __all__ = [
4
+ 'train_model'
5
+ ]
models/apis/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (206 Bytes). View file
 
models/apis/__pycache__/train.cpython-38.pyc ADDED
Binary file (3.15 kB). View file
 
models/apis/train.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from models.core.custom_hooks.shuffle_hooks import ShufflePairedSamplesHook
5
+ from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
6
+ from mmcv.runner import (DistSamplerSeedHook, EpochBasedRunner, OptimizerHook,
7
+ build_optimizer)
8
+ from mmpose.core import DistEvalHook, EvalHook, Fp16OptimizerHook
9
+ from mmpose.datasets import build_dataloader
10
+ from mmpose.utils import get_root_logger
11
+
12
+
13
def train_model(model,
                dataset,
                val_dataset,
                cfg,
                distributed=False,
                validate=False,
                timestamp=None,
                meta=None):
    """Train model entry function.

    Args:
        model (nn.Module): The model to be trained.
        dataset (Dataset): Train dataset.
        val_dataset (Dataset): Validation dataset; used only when
            ``validate`` is True.
        cfg (dict): The config dict for training.
        distributed (bool): Whether to use distributed training.
            Default: False.
        validate (bool): Whether to do evaluation. Default: False.
        timestamp (str | None): Local time for runner. Default: None.
        meta (dict | None): Meta dict to record some important information.
            Default: None
    """
    logger = get_root_logger(cfg.log_level)

    # prepare data loaders
    dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
    dataloader_setting = dict(
        samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
        workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
        # cfg.gpus will be ignored if distributed
        num_gpus=len(cfg.gpu_ids),
        dist=distributed,
        seed=cfg.seed,
        pin_memory=False,
    )
    # explicit train_dataloader settings override the defaults assembled above
    dataloader_setting = dict(dataloader_setting,
                              **cfg.data.get('train_dataloader', {}))

    data_loaders = [
        build_dataloader(ds, **dataloader_setting) for ds in dataset
    ]

    # put model on gpus
    if distributed:
        find_unused_parameters = cfg.get('find_unused_parameters',
                                         False)  # NOTE: True has been modified to False for faster training.
        # Sets the `find_unused_parameters` parameter in
        # torch.nn.parallel.DistributedDataParallel
        model = MMDistributedDataParallel(
            model.cuda(),
            device_ids=[torch.cuda.current_device()],
            broadcast_buffers=False,
            find_unused_parameters=find_unused_parameters)
    else:
        model = MMDataParallel(
            model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids)

    # build runner
    optimizer = build_optimizer(model, cfg.optimizer)
    runner = EpochBasedRunner(
        model,
        optimizer=optimizer,
        work_dir=cfg.work_dir,
        logger=logger,
        meta=meta)
    # an ugly workaround to make .log and .log.json filenames the same
    runner.timestamp = timestamp

    # fp16 setting
    fp16_cfg = cfg.get('fp16', None)
    if fp16_cfg is not None:
        optimizer_config = Fp16OptimizerHook(
            **cfg.optimizer_config, **fp16_cfg, distributed=distributed)
    elif distributed and 'type' not in cfg.optimizer_config:
        optimizer_config = OptimizerHook(**cfg.optimizer_config)
    else:
        optimizer_config = cfg.optimizer_config

    # register hooks
    runner.register_training_hooks(cfg.lr_config, optimizer_config,
                                   cfg.checkpoint_config, cfg.log_config,
                                   cfg.get('momentum_config', None))
    if distributed:
        runner.register_hook(DistSamplerSeedHook())

    # re-pair few-shot support/query samples between epochs (see shuffle_cfg in configs)
    shuffle_cfg = cfg.get('shuffle_cfg', None)
    if shuffle_cfg is not None:
        for data_loader in data_loaders:
            runner.register_hook(ShufflePairedSamplesHook(data_loader, **shuffle_cfg))

    # register eval hooks
    if validate:
        eval_cfg = cfg.get('evaluation', {})
        # NOTE(review): assumes cfg.evaluation always provides 'res_folder';
        # raises KeyError otherwise — confirm against the configs.
        eval_cfg['res_folder'] = os.path.join(cfg.work_dir, eval_cfg['res_folder'])
        dataloader_setting = dict(
            # samples_per_gpu=cfg.data.get('samples_per_gpu', {}),
            samples_per_gpu=1,
            workers_per_gpu=cfg.data.get('workers_per_gpu', {}),
            # cfg.gpus will be ignored if distributed
            num_gpus=len(cfg.gpu_ids),
            dist=distributed,
            shuffle=False,
            pin_memory=False,
        )
        dataloader_setting = dict(dataloader_setting,
                                  **cfg.data.get('val_dataloader', {}))
        val_dataloader = build_dataloader(val_dataset, **dataloader_setting)
        eval_hook = DistEvalHook if distributed else EvalHook
        runner.register_hook(eval_hook(val_dataloader, **eval_cfg))

    # resume takes precedence over load: it also restores optimizer/epoch state
    if cfg.resume_from:
        runner.resume(cfg.resume_from)
    elif cfg.load_from:
        runner.load_checkpoint(cfg.load_from)
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
models/core/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
models/core/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (141 Bytes). View file
 
models/core/custom_hooks/__pycache__/shuffle_hooks.cpython-38.pyc ADDED
Binary file (1.26 kB). View file
 
models/core/custom_hooks/shuffle_hooks.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmcv.runner import Hook
2
+ from mmpose.utils import get_root_logger
3
+ from torch.utils.data import DataLoader
4
+
5
+
6
class ShufflePairedSamplesHook(Hook):
    """Non-distributed hook that reshuffles few-shot sample pairings.

    Every ``interval`` training epochs, calls
    ``dataset.random_paired_samples()`` on the wrapped dataloader's dataset
    so that support/query pairs are drawn anew for the next epoch.
    """

    def __init__(self, dataloader, interval=1):
        if not isinstance(dataloader, DataLoader):
            raise TypeError(f'dataloader must be a pytorch DataLoader, '
                            f'but got {type(dataloader)}')

        self.dataloader = dataloader
        self.interval = interval
        self.logger = get_root_logger()

    def after_train_epoch(self, runner):
        """Re-pair the dataset's samples at the configured epoch interval."""
        if self.every_n_epochs(runner, self.interval):
            self.dataloader.dataset.random_paired_samples()
models/datasets/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .builder import * # noqa
2
+ from .datasets import * # noqa
3
+ from .pipelines import * # noqa
models/datasets/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (210 Bytes). View file
 
models/datasets/__pycache__/builder.cpython-38.pyc ADDED
Binary file (1.92 kB). View file
 
models/datasets/builder.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mmcv.utils import build_from_cfg
2
+ from mmpose.datasets.builder import DATASETS
3
+ from mmpose.datasets.dataset_wrappers import RepeatDataset
4
+ from torch.utils.data.dataset import ConcatDataset
5
+
6
+
7
def _concat_cfg(cfg):
    """Expand a multi-dataset config into one sub-config per dataset type.

    ``cfg['type']`` is a list; for each entry a deep copy of ``cfg`` is made
    with the per-dataset fields (``ann_file``, ``img_prefix``) and per-channel
    fields (``num_joints``, ``dataset_channel``) narrowed to that index.
    """
    per_dataset_keys = ['ann_file', 'img_prefix']
    per_channel_keys = ['num_joints', 'dataset_channel']
    expanded = []
    for idx in range(len(cfg['type'])):
        sub_cfg = cfg.deepcopy()
        sub_cfg['type'] = cfg['type'][idx]
        for key in per_dataset_keys:
            assert key in sub_cfg
            assert len(cfg['type']) == len(cfg[key]), (cfg[key])
            sub_cfg[key] = cfg[key][idx]
        for key in per_channel_keys:
            assert key in sub_cfg['data_cfg']
            assert len(cfg['type']) == len(cfg['data_cfg'][key])
            sub_cfg['data_cfg'][key] = cfg['data_cfg'][key][idx]
        expanded.append(sub_cfg)
    return expanded
24
+
25
+
26
+ def _check_vaild(cfg):
27
+ replace = ['num_joints', 'dataset_channel']
28
+ if isinstance(cfg['data_cfg'][replace[0]], (list, tuple)):
29
+ for item in replace:
30
+ cfg['data_cfg'][item] = cfg['data_cfg'][item][0]
31
+ return cfg
32
+
33
+
34
def build_dataset(cfg, default_args=None):
    """Build a dataset from config dict.

    Args:
        cfg (dict): Config dict. It should at least contain the key "type".
        default_args (dict, optional): Default initialization arguments.
            Default: None.

    Returns:
        Dataset: The constructed dataset.
    """
    cfg_type = cfg['type']
    # In training, type is a list (e.g. several TransformerPoseDataset splits):
    # build each sub-config and concatenate.
    if isinstance(cfg_type, (list, tuple)):
        return ConcatDataset(
            [build_dataset(sub_cfg, default_args) for sub_cfg in _concat_cfg(cfg)])
    if cfg_type == 'RepeatDataset':
        return RepeatDataset(
            build_dataset(cfg['dataset'], default_args), cfg['times'])
    cfg = _check_vaild(cfg)
    return build_from_cfg(cfg, DATASETS, default_args)